In [1]:
import kagglehub
# Fetch the Pascal VOC 2012 dataset through kagglehub; `path` keeps whatever
# location kagglehub returns (the recorded output shows a cached copy being reused).
DATASET_HANDLE = "gopalbhattrai/pascal-voc-2012-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

Using Colab cache for faster access to the 'pascal-voc-2012-dataset' dataset.


In [2]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.252-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.252-py3-none-any.whl (1.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.252 ultralytics-thop-2.0.18


In [3]:
import os
import glob
import random
import shutil
import xml.etree.ElementTree as ET
from PIL import Image
from tqdm.notebook import tqdm

def _find_voc_root(search_roots):
    """Return the first directory containing both VOC sub-folders, or None.

    Args:
        search_roots: Directories to walk, in priority order. Entries that are
            empty or do not exist are skipped.

    Returns:
        The path of the first directory that has both a ``JPEGImages`` and an
        ``Annotations`` child, or ``None`` when no such directory is found.
    """
    for search_root in search_roots:
        if not search_root or not os.path.isdir(search_root):
            continue
        for root, dirs, _files in os.walk(search_root):
            if "JPEGImages" in dirs and "Annotations" in dirs:
                return root
    return None


# Look inside the directory kagglehub actually returned first; the fixed
# "/kaggle/input" mount is kept as a second candidate for backward compatibility.
found_path = _find_voc_root([path, "/kaggle/input"])
BASE_PATH = found_path if found_path else "/kaggle/input/pascal-voc-2012-dataset/VOC2012_train_val/VOC2012_train_val"

IMAGES_PATH = os.path.join(BASE_PATH, "JPEGImages")
XML_PATH = os.path.join(BASE_PATH, "Annotations")

# Fail here, before the previous output is deleted, instead of silently
# producing an empty dataset that only breaks once training starts.
if not (os.path.isdir(IMAGES_PATH) and os.path.isdir(XML_PATH)):
    raise FileNotFoundError(
        f"Pascal VOC folders not found under {BASE_PATH!r} "
        "(expected 'JPEGImages' and 'Annotations')."
    )

OUTPUT_DIR = "/kaggle/working/yolo_dataset"
TARGET_CLASSES = ["person", "cat", "dog", "bird", "tvmonitor"]
# Class name -> integer id, in the order the names are listed above.
CLASS_MAP = {name: i for i, name in enumerate(TARGET_CLASSES)}

# Start from a clean output tree so files from an earlier run cannot leak
# into the new train/val split.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

for split in ['train', 'val']:
    os.makedirs(os.path.join(OUTPUT_DIR, 'images', split), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, 'labels', split), exist_ok=True)

def convert_bbox(size, box):
    """Convert a pixel-space box into normalized centre/extent form.

    Args:
        size: ``(width, height)`` of the image.
        box: ``(xmin, xmax, ymin, ymax)`` -- note that both x edges come first.

    Returns:
        ``(x_center, y_center, box_width, box_height)`` where the x values are
        multiplied by ``1 / width`` and the y values by ``1 / height``.
    """
    x_min, x_max, y_min, y_max = box[0], box[1], box[2], box[3]

    # Reciprocals of the image dimensions, used as scale factors below.
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]

    center_x = (x_min + x_max) / 2.0
    center_y = (y_min + y_max) / 2.0
    extent_x = x_max - x_min
    extent_y = y_max - y_min

    return (
        center_x * scale_x,
        center_y * scale_y,
        extent_x * scale_x,
        extent_y * scale_y,
    )

def parse_xml(xml_file):
    """Read one VOC annotation file and build YOLO label lines for the target classes.

    Args:
        xml_file: Path of the annotation XML file.

    Returns:
        A list of ``"<class_id> <x> <y> <w> <h>"`` strings, one per object whose
        class is in ``CLASS_MAP`` (the list is empty when there is none), or
        ``None`` when the file cannot be read, is malformed, lacks a required
        element, or declares a non-positive image size.
    """
    try:
        root = ET.parse(xml_file).getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)
        # A non-positive dimension would make normalization meaningless.
        if w <= 0 or h <= 0:
            return None

        labels = []
        for obj in root.iter('object'):
            cls = obj.find('name').text
            if cls not in CLASS_MAP:
                continue
            b = obj.find('bndbox')
            bb = convert_bbox((w, h), (
                float(b.find('xmin').text), float(b.find('xmax').text),
                float(b.find('ymin').text), float(b.find('ymax').text)
            ))
            labels.append(f"{CLASS_MAP[cls]} {bb[0]} {bb[1]} {bb[2]} {bb[3]}")
        return labels
    except (ET.ParseError, OSError, AttributeError, TypeError, ValueError):
        # Only the failures a bad annotation file can produce are swallowed:
        # unreadable file, invalid XML, a missing element (``find`` returned
        # None), or non-numeric text. Anything else, including
        # KeyboardInterrupt, propagates to the caller.
        return None

# Fraction of the usable samples assigned to the training split.
TRAIN_FRACTION = 0.8

# glob returns files in filesystem order; sorting first makes the seeded
# shuffle below produce the same train/val split on every machine.
xml_files = sorted(glob.glob(os.path.join(XML_PATH, "*.xml")))
data_pairs = []

print("Parsing XML files...")
for xml in tqdm(xml_files):
    labels = parse_xml(xml)
    # Skip files that failed to parse (None) or have no target-class object ([]).
    if labels:
        # splitext strips only the trailing extension, unlike str.replace,
        # which would also rewrite a ".xml" occurring earlier in the name.
        base = os.path.splitext(os.path.basename(xml))[0]
        src = os.path.join(IMAGES_PATH, base + ".jpg")
        if os.path.exists(src):
            data_pairs.append((base, src, labels))

random.seed(42)
random.shuffle(data_pairs)
split_idx = int(len(data_pairs) * TRAIN_FRACTION)
train_pairs = data_pairs[:split_idx]
val_pairs = data_pairs[split_idx:]

def copy_files(pairs, split):
    """Write one dataset split to disk: copy each image and create its label file.

    Args:
        pairs: Iterable of ``(base, src, labels)`` tuples, where ``base`` is the
            file stem, ``src`` the source image path and ``labels`` the label
            lines for that image.
        split: Name of the sub-directory under ``images``/``labels`` in
            ``OUTPUT_DIR`` that receives the files.
    """
    images_out = os.path.join(OUTPUT_DIR, 'images', split)
    labels_out = os.path.join(OUTPUT_DIR, 'labels', split)

    progress = tqdm(pairs, desc=f"Copying {split}")
    for stem, image_src, label_lines in progress:
        image_dst = os.path.join(images_out, stem + ".jpg")
        label_dst = os.path.join(labels_out, stem + ".txt")

        shutil.copy2(image_src, image_dst)
        # One label per line, without a trailing newline.
        with open(label_dst, 'w') as label_file:
            label_file.write('\n'.join(label_lines))

copy_files(train_pairs, 'train')
copy_files(val_pairs, 'val')

print("\n Starting Verification Scan...")
removed = 0

for split in ['train', 'val']:
    img_dir = os.path.join(OUTPUT_DIR, 'images', split)
    lbl_dir = os.path.join(OUTPUT_DIR, 'labels', split)

    for img_file in tqdm(glob.glob(os.path.join(img_dir, "*.jpg")), desc=f"Verifying {split}"):
        try:
            with Image.open(img_file) as img:
                img.verify()
        except Exception:
            # Any failure to open or verify the copy marks it as unusable.
            valid = False
        else:
            valid = True

        if valid:
            continue

        # Drop the unreadable image together with its label file so the
        # images/ and labels/ trees stay in sync.
        os.remove(img_file)
        # splitext touches only the trailing extension; str.replace would also
        # rewrite a ".jpg" that occurs earlier in the file name.
        stem = os.path.splitext(os.path.basename(img_file))[0]
        lbl_path = os.path.join(lbl_dir, stem + ".txt")
        if os.path.exists(lbl_path):
            os.remove(lbl_path)
        removed += 1

print(f"\nVerification Complete. Removed {removed} corrupted images.")

Parsing XML files...


  0%|          | 0/17125 [00:00<?, ?it/s]

Copying train:   0%|          | 0/10196 [00:00<?, ?it/s]

Copying val:   0%|          | 0/2549 [00:00<?, ?it/s]


 Starting Verification Scan...


Verifying train:   0%|          | 0/10196 [00:00<?, ?it/s]

Verifying val:   0%|          | 0/2549 [00:00<?, ?it/s]


Verification Complete. Removed 0 corrupted images.


In [4]:
# Dataset description file passed to the trainer. The dataset root is taken
# from OUTPUT_DIR so it cannot drift from the directory the conversion step
# wrote to. `names` is rendered from the Python list repr.
yaml_content = f"""
path: {OUTPUT_DIR}
train: images/train
val: images/val

nc: {len(TARGET_CLASSES)}
names: {TARGET_CLASSES}
"""

with open("/kaggle/working/custom_voc.yaml", "w") as f:
    f.write(yaml_content)

In [None]:
from ultralytics import YOLO

# Instantiate from the `yolov8n.yaml` model definition (no `.pt` checkpoint is
# loaded here); the run name below marks this as a from-scratch run.
model = YOLO("yolov8n.yaml")

# Training arguments gathered in one place so they are easy to scan and tweak.
train_settings = dict(
    data="/kaggle/working/custom_voc.yaml",
    epochs=50,
    imgsz=640,
    batch=16,
    name="yolov8n_voc_scratch",
    exist_ok=True,
    amp=False,
    workers=0,
    val=True,
)

results = model.train(**train_settings)
model.save("custom_yolov8n_scratch_final.pt")

Ultralytics 8.3.252 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=False, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/kaggle/working/custom_voc.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.yaml, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8n_voc_scratch, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, 