# 2025 DL Lab5: Object Detection on Pascal VOC

**Your Answer:**    
Hi I'm 邱照元, 314834001

In [None]:
import os
from google.colab import userdata

# Copy the Kaggle credentials from the Colab secret store into environment
# variables of the same name (presumably where the kaggle CLI looks for them).
for _secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    os.environ[_secret_name] = userdata.get(_secret_name)
# !kaggle competitions download -c lab-5-object-detection-on-pascal-voc-639401
# !unzip -q lab-5-object-detection-on-pascal-voc-639401

In [None]:
# Disabled alternative: clone the *latest* commit of the project repo (no
# pinned SHA) and move its contents into /content. Kept for reference only;
# the following cell performs the same setup for a fixed commit.
# import shutil

# REPO_URL = "https://github.com/zouyuoz/Lab5"
# REPO_NAME = "Lab5"
# TARGET_DIR = "/content"
# CLONE_DIR = os.path.join(TARGET_DIR, REPO_NAME)

# !git clone {REPO_URL} {CLONE_DIR}
# !rsync -aq --remove-source-files {CLONE_DIR}/ {TARGET_DIR}/
# !rm -rf {CLONE_DIR}
# os.chdir(TARGET_DIR)
# (message below reads: "Project setup complete. Current working directory: ...")
# print(f"\n✅ 專案設定完成。目前工作目錄：{os.getcwd()}")

In [None]:
import os
import shutil

REPO_URL = "https://github.com/zouyuoz/Lab5"
REPO_NAME = "Lab5"
TARGET_DIR = "/content"
CLONE_DIR = os.path.join(TARGET_DIR, REPO_NAME)
TARGET_SHA = "29107facfe63a579f482839124d599bfd615bb5e"

!git clone {REPO_URL} {CLONE_DIR}
%cd {CLONE_DIR}
!git checkout {TARGET_SHA}

!rsync -aq --remove-source-files . {TARGET_DIR}/
%cd {TARGET_DIR} # 切換回 TARGET_DIR
!rm -rf {CLONE_DIR}
os.chdir(TARGET_DIR)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import os
from torch.amp import autocast, GradScaler
from src.yolo import getODmodel
from yolo_loss import YOLOv3Loss
from yolo_loss_ciou_focal import YOLOv3Loss_CIoU_Focal
from src.dataset import VocDetectorDataset, train_data_pipelines, test_data_pipelines, collate_fn
from src.eval_voc import evaluate
from src.config import GRID_SIZES, ANCHORS
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
##### hyperparameters #####
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Training length, mini-batch size and initial learning rate.
num_epochs, batch_size = 50, 64
learning_rate = 1e-3

# Loss-term weights. In this notebook they are only referenced by the
# commented-out criterion constructors; the active criterion is built from
# the anchors alone.
lambda_coord, lambda_obj, lambda_noobj, lambda_class = 5.0, 1.0, 0.5, 1.0

In [None]:
# Image folder and annotation list for each split (both splits read images
# from the same folder).
file_root_train = './dataset/image/'
annotation_file_train = './dataset/vocall_train.txt'
file_root_val = './dataset/image/'
annotation_file_val = './dataset/vocall_val.txt'

print('Loading datasets...')

# Training split: uses the training transform pipeline and encoded targets.
train_dataset = VocDetectorDataset(
    root_img_dir=file_root_train,
    dataset_file=annotation_file_train,
    train=True,
    transform=train_data_pipelines,
    grid_sizes=GRID_SIZES,
    encode_target=True,
)

# The validation split is wrapped twice over the same files:
#   val_dataset  -> encode_target=True, fed to the loss for the validation loss
#   eval_dataset -> encode_target=False, handed to evaluate() for mAP
_val_source = dict(
    root_img_dir=file_root_val,
    dataset_file=annotation_file_val,
    train=False,
    transform=test_data_pipelines,
    grid_sizes=GRID_SIZES,
)
val_dataset = VocDetectorDataset(encode_target=True, **_val_source)
eval_dataset = VocDetectorDataset(encode_target=False, **_val_source)

# All loaders share batch size, collate function and worker count; only the
# training loader shuffles.
_loader_common = dict(batch_size=batch_size, collate_fn=collate_fn, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_common)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_common)
eval_loader = DataLoader(eval_dataset, shuffle=False, **_loader_common)

print(f'Loaded {len(train_dataset)} train images')
print(f'Loaded {len(val_dataset)} val images')

Loading datasets...
Initializing dataset
Loaded 8218 train images
Initializing dataset
Initializing dataset
Loaded 3823 val images




## Initialization

In [None]:
# Optional path to a state_dict saved by the training loop (e.g.
# 'checkpoints/best_detector.pth'). None starts from the model returned by
# getODmodel without loading any saved weights.
load_network_path = None #'checkpoints/best_detector.pth'
pretrained = True
model = getODmodel(pretrained=pretrained).to(device)

# Resume from a saved checkpoint when a path is given. Previously this
# variable was assigned but never read, so setting it had no effect.
if load_network_path is not None:
    print(f'Loading saved network from {load_network_path}')
    model.load_state_dict(torch.load(load_network_path, map_location=device))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Training utilities (mixed precision is used when CUDA is available)

In [None]:
# Loss function. The active criterion is constructed from the anchor
# configuration only; earlier variants are kept below for reference.
# criterion = YOLOv3Loss(lambda_coord, lambda_obj, lambda_noobj, lambda_class, ANCHORS).to(device)
# criterion = YOLOv3Loss_CIoU_Focal(lambda_coord=lambda_coord, lambda_noobj=lambda_noobj, anchors=ANCHORS)
criterion = YOLOv3Loss_CIoU_Focal(anchors=ANCHORS)

# AdamW over all model parameters, with the learning rate annealed along a
# cosine curve from `learning_rate` down to eta_min over `num_epochs` steps.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=5e-4,
)
lr_scheduler = CosineAnnealingLR(
    optimizer,
    T_max=num_epochs,
    eta_min=1e-6,
)

# Mixed precision is switched on only when CUDA is available; otherwise the
# gradient scaler is created in its disabled state.
use_amp = torch.cuda.is_available()
scaler = GradScaler(enabled=use_amp)

### Training Loop

In [None]:
# Training loop: one pass over train_loader per epoch, then a validation-loss
# pass, a checkpoint save, and (every 5th epoch) an mAP evaluation.
print('\nStarting training...')
torch.cuda.empty_cache()
# Create the checkpoint directory up front: the unconditional per-epoch save
# below must not depend on the "best validation loss" branch having run first
# (that branch is skipped when the loss is NaN).
os.makedirs('checkpoints', exist_ok=True)
best_val_loss = np.inf
best_map = 0.0
for epoch in range(num_epochs):
    model.train()
    print(f'\nStarting epoch {epoch + 1} / {num_epochs}')
    for i, (images, target) in enumerate(train_loader):
        # Move to device
        images = images.to(device)
        target = [t.to(device) for t in target]
        # Forward pass
        optimizer.zero_grad()
        with autocast("cuda", enabled=use_amp):
            pred = model(images)
            # pred and target are lists of each scales
            loss_dict = criterion(pred, target)
        # Backward pass with mixed precision support
        scaler.scale(loss_dict['total']).backward()

        log_this_iter = (i + 1) % 40 == 0
        if log_this_iter:
            # The gradients still carry the AMP loss scale at this point, so
            # unscale them first; otherwise the printed norm is inflated by
            # the scale factor. scaler.step() below notices the gradients were
            # already unscaled and does not unscale them a second time.
            scaler.unscale_(optimizer)
            # Global L2 norm = sqrt(sum of squared per-parameter norms),
            # not the plain sum of the per-parameter norms.
            squared_norm_sum = 0.0
            for p in model.parameters():
                if p.grad is not None:
                    squared_norm_sum += p.grad.detach().float().norm(2).item() ** 2
            print(f"Grad norm: {squared_norm_sum ** 0.5:.2f}")

        scaler.step(optimizer)
        scaler.update()
        # Print progress
        if log_this_iter:
            outstring = f'Epoch [{epoch+1}/{num_epochs}], Iter [{i+1}/{len(train_loader)}], Loss: '
            outstring += ', '.join(f"{key}={val :.3f}" for key, val in loss_dict.items())
            print(outstring)
    lr_scheduler.step()
    # Use a separate name for the scheduled value so the `learning_rate`
    # hyperparameter keeps its configured value (it is reported as max_lr in
    # the submission message later in the notebook).
    current_lr = lr_scheduler.get_last_lr()[0]
    print(f'Learning Rate for this epoch: {current_lr:.3e}')
    # Validation
    with torch.no_grad():
        val_loss = 0.0
        model.eval()
        for i, (images, target) in enumerate(val_loader):
            # Move to device
            images = images.to(device)
            target = [t.to(device) for t in target]
            # Forward pass
            pred = model(images)
            loss_dict = criterion(pred, target)
            val_loss += loss_dict['total'].item()

        # Average of the per-batch totals
        val_loss /= len(val_loader)
        print(f'Validation Loss: {val_loss:.4f}')

    # Save best model (by validation loss)
    if best_val_loss > val_loss:
        best_val_loss = val_loss
        print(f'Updating best val loss: {best_val_loss:.5f}')
        torch.save(model.state_dict(), 'checkpoints/best_detector.pth')

    # Save checkpoint
    # if (epoch + 1) in [5, 10, 20, 30, 40]:
    #     torch.save(model.state_dict(), f'checkpoints/detector_epoch_{epoch+1}.pth')

    # Always keep the most recent weights as well
    torch.save(model.state_dict(), 'checkpoints/detector.pth')

    # Evaluate on val set
    if (epoch + 1) % 5 == 0:
        print('\nEvaluating on validation set...')
        val_aps = evaluate(model, eval_loader)
        mean_ap = np.mean(val_aps)
        print(f'Epoch {epoch + 1}, mAP: {mean_ap:.4f}')
        # Keep a separate checkpoint for the best mAP seen so far
        if mean_ap > best_map:
            best_map = mean_ap
            torch.save(model.state_dict(), 'checkpoints/best_map_detector.pth')


Starting training...

Starting epoch 1 / 50
Grad norm: 181455.63
Epoch [1/50], Iter [40/129], Loss: total=1.534, box=55.984, obj=7269.431, noobj=7240.567, cls=63.296, num_pos=176.000, num_neg=681232.000, B=64.000
Grad norm: 143394.76
Epoch [1/50], Iter [80/129], Loss: total=1.261, box=65.411, obj=3107.966, noobj=3070.416, cls=62.763, num_pos=217.000, num_neg=681191.000, B=64.000
Grad norm: 176582.50
Epoch [1/50], Iter [120/129], Loss: total=1.148, box=51.714, obj=2340.566, noobj=2316.039, cls=53.893, num_pos=177.000, num_neg=681231.000, B=64.000
Learning Rate for this epoch: 9.990e-04
Validation Loss: 1.1631
Updating best val loss: 1.16311

Starting epoch 2 / 50
Grad norm: 193757.36
Epoch [2/50], Iter [40/129], Loss: total=1.030, box=42.227, obj=1878.737, noobj=1855.515, cls=45.749, num_pos=163.000, num_neg=681245.000, B=64.000
Grad norm: 218832.86
Epoch [2/50], Iter [80/129], Loss: total=1.104, box=57.116, obj=1622.171, noobj=1593.565, cls=52.877, num_pos=193.000, num_neg=681215.000,

100%|██████████| 60/60 [00:40<00:00,  1.49it/s]


0.21962 AP of class aeroplane
0.14098 AP of class bicycle
0.16970 AP of class bird
0.00000 AP of class boat (no predictions for this class)
0.00000 AP of class bottle (no predictions for this class)
0.00000 AP of class bus (no predictions for this class)
0.20703 AP of class car
0.22023 AP of class cat
0.00196 AP of class chair
0.00000 AP of class cow (no predictions for this class)
0.20392 AP of class diningtable
0.07137 AP of class dog
0.20762 AP of class horse
0.09713 AP of class motorbike
0.30490 AP of class person
0.00000 AP of class pottedplant (no predictions for this class)
0.00000 AP of class sheep (no predictions for this class)
0.03791 AP of class sofa
0.31962 AP of class train
0.00000 AP of class tvmonitor (no predictions for this class)
--- MAP: 0.11010 ---
Epoch 5, mAP: 0.1101

Starting epoch 6 / 50
Grad norm: 181219.26
Epoch [6/50], Iter [40/129], Loss: total=0.784, box=34.836, obj=985.916, noobj=972.492, cls=32.253, num_pos=156.000, num_neg=681252.000, B=64.000
Grad norm

100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


0.35209 AP of class aeroplane
0.25880 AP of class bicycle
0.21045 AP of class bird
0.03201 AP of class boat
0.00616 AP of class bottle
0.30020 AP of class bus
0.21223 AP of class car
0.44113 AP of class cat
0.09787 AP of class chair
0.00000 AP of class cow (no predictions for this class)
0.28367 AP of class diningtable
0.35949 AP of class dog
0.41803 AP of class horse
0.34979 AP of class motorbike
0.47557 AP of class person
0.00498 AP of class pottedplant
0.00000 AP of class sheep (no predictions for this class)
0.20057 AP of class sofa
0.44805 AP of class train
0.01639 AP of class tvmonitor
--- MAP: 0.22337 ---
Epoch 10, mAP: 0.2234

Starting epoch 11 / 50


KeyboardInterrupt: 

# Kaggle submission

### Predict Result

Generate predictions for the test set, then upload them to [Kaggle](https://www.kaggle.com/t/3fd493e454a744bdacc7f2918f9a2605).

**How to upload**

1. Click the folder icon on the left-hand side of Colab.
2. Right-click "result.csv" and select "Download".
3. Go to Kaggle and click "Submit Predictions".
4. Upload result.csv.
5. The system will automatically calculate the accuracy on 50% of the dataset and publish the result to the leaderboard.


In [None]:
# Run the project's prediction script with the checkpoint that achieved the
# best validation mAP. That file is only written by the training loop after an
# mAP evaluation (every 5th epoch), so it does not exist if training stopped
# earlier. Presumably the script writes result.csv, which the submission cell
# uploads — TODO confirm against predict_test.py.
!python predict_test.py --weights checkpoints/best_map_detector.pth

In [None]:
MESSAGE = (""
    +f"num_epochs={num_epochs}\n"
    +f"max_lr={learning_rate}\n"
    +f"batch_size={batch_size}\n"
)
!kaggle competitions submit -c lab-5-object-detection-on-pascal-voc-639401 -f result.csv -m "{MESSAGE}"

100% 591k/591k [00:00<00:00, 911kB/s]
Successfully submitted to Lab5 Object Detection on Pascal VOC (639401)