In [2]:
import argparse
import os
import random
import time
import numpy as np
import torch
import torch.backends.cudnn as cudnn
# Move the notebook's working directory up one level so the project packages
# can be imported. The guard keeps this cell idempotent: an unconditional
# chdir climbs one more directory every time the cell is re-run.
# NOTE(review): assumes the project root is the directory that contains the
# `dataset` package imported in the next cell -- confirm.
if not os.path.isdir('dataset'):
    os.chdir('../')

In [3]:
from dataset.VOC_dataset import VOCDataset
from dataset.augment import Transforms
from model.fcos import FCOSDetector

ModuleNotFoundError: No module named 'dataset'

## 参数设定

In [3]:
# Command-line style configuration for the training run.
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=24, help="number of epochs")
parser.add_argument("--batch_size", type=int, default=4, help="size of each image batch")
parser.add_argument("--n_cpu", type=int, default=4, help="number of cpu threads to use during batch generation")
# The value is written verbatim into CUDA_VISIBLE_DEVICES, hence a string.
# (The help text used to be a copy of the --n_cpu help.)
parser.add_argument("--n_gpu", type=str, default='0',
                    help="GPU device id(s) to expose via CUDA_VISIBLE_DEVICES, e.g. '0' or '0,1'")
# Inside a notebook, parse an explicit empty argv so argparse does not try to
# interpret the kernel's own command-line arguments.
opt = parser.parse_args([])

## GPU环境设定

In [4]:
# Expose only the GPU id(s) chosen through --n_gpu to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = opt.n_gpu

# Seed every random number generator used here with the same fixed value.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)

# Request deterministic cuDNN algorithms and turn benchmark mode off.
cudnn.deterministic = True
cudnn.benchmark = False

## 数据集设置

In [5]:
BATCH_SIZE = opt.batch_size


def seed_worker(worker_id):
    """Seed NumPy and ``random`` inside one DataLoader worker process.

    The previous code passed ``worker_init_fn=np.random.seed(0)``, which calls
    the seeding function once in the main process and hands its return value
    (``None``) to the DataLoader, so no worker was ever seeded.

    The seed is derived from ``torch.initial_seed()``, which PyTorch sets to a
    distinct value in every worker, so workers do not replay the same NumPy /
    ``random`` stream.

    Args:
        worker_id: index of the worker, supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


# Training split with the augmentation pipeline attached.
transform = Transforms()
train_dataset = VOCDataset(root_dir='../datasets/VOCdevkit/VOC2012', resize_size=[640, 800],
                           split='trainval', use_difficult=False, is_train=True, augment=transform)

# NOTE(review): with num_workers > 0 on a platform that spawns (rather than
# forks) worker processes, a function defined in the notebook may not be
# picklable; move seed_worker into an importable module if that is hit.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                                           collate_fn=train_dataset.collate_fn,
                                           num_workers=opt.n_cpu, worker_init_fn=seed_worker)

print("total_images : {}".format(len(train_dataset)))

INFO=====>voc dataset init finished  ! !
total_images : 5011


## 模型设置

In [1]:
# Build the detector in its "training" mode, then move it onto the GPU.
model = FCOSDetector(mode="training")
model = model.cuda()
# model = torch.nn.DataParallel(model)  # enable when training on multiple GPUs
# Switch the module to train mode (left as the cell's last expression, as before).
model.train()

NameError: name 'FCOSDetector' is not defined

## 训练参数设置

In [None]:
EPOCHS = opt.epochs  # total number of training epochs
# Number of optimizer steps per epoch. Take it from the loader itself: the
# loader is built without drop_last, so it also yields the final partial
# batch, which `len(train_dataset) // BATCH_SIZE` left out of the count (and
# therefore out of TOTAL_STEPS and the LR milestones derived from it).
steps_per_epoch = len(train_loader)
TOTAL_STEPS = steps_per_epoch * EPOCHS  # total number of optimizer steps
WARMPUP_STEPS = 501  # length of the linear learning-rate warm-up, in steps
output_dir = 'training_dir'  # directory where per-epoch checkpoints are written
# Create the checkpoint directory if needed; exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

GLOBAL_STEPS = 1  # 1-based global step counter, advanced by the training loop
LR_INIT = 1e-3  # base learning rate

## 优化器设置

In [7]:
# SGD with momentum and weight decay. The rate given here is only the starting
# value; the training loop writes the scheduled rate into the param groups.
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=LR_INIT,
    weight_decay=0.0001,
    momentum=0.9,
)

## 开始训练

In [None]:
def scheduled_lr(step, total_steps, warmup_steps, base_lr):
    """Return the learning rate to use at a given 1-based global step.

    Schedule:
      * step < warmup_steps             -> linear warm-up, step / warmup_steps * base_lr
      * up to 70% of total_steps        -> base_lr
      * from 70% up to 90% of the steps -> base_lr * 0.1
      * from 90% of the steps onwards   -> base_lr * 0.01

    The rate is a pure function of the step, which fixes three problems of the
    earlier in-loop version: the rate stayed at the last warm-up value (just
    below base_lr) instead of reaching base_lr; a decay milestone was only
    applied on the single step that equalled it, so it was lost if it fell
    inside the warm-up or if the loop was resumed past it; and `lr` could be
    unbound when logging if no branch had fired. Warm-up takes priority when a
    milestone falls inside it.

    Args:
        step: current global step, counted from 1.
        total_steps: planned total number of optimizer steps.
        warmup_steps: number of steps in the linear warm-up.
        base_lr: learning rate used after the warm-up and before any decay.

    Returns:
        The learning rate as a float.
    """
    if step < warmup_steps:
        return float(step) / warmup_steps * base_lr
    if step < int(total_steps * 0.7):
        return base_lr
    if step < int(total_steps * 0.9):
        return base_lr * 0.1
    return base_lr * 0.01


for epoch in range(EPOCHS):  # one pass over the training set per epoch
    for epoch_step, data in enumerate(train_loader):  # one optimizer step per batch

        # ===================== fetch the batch =====================
        batch_imgs, batch_boxes, batch_classes = data
        batch_imgs = batch_imgs.cuda()
        batch_boxes = batch_boxes.cuda()
        batch_classes = batch_classes.cuda()

        # ================== learning-rate schedule =================
        # Recomputed and applied every step so it never depends on state
        # left over from a previous step or a previous run of this cell.
        lr = scheduled_lr(GLOBAL_STEPS, TOTAL_STEPS, WARMPUP_STEPS, LR_INIT)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # ===================== parameter update ====================
        # NOTE(review): the timer is read without a CUDA synchronize, so
        # cost_time may not include all queued GPU work -- confirm if the
        # number matters.
        start_time = time.time()
        # 1. clear gradients from the previous step
        optimizer.zero_grad()
        # 2. forward pass; the last entry of `losses` is logged as the total
        #    loss, entries 0..2 as the cls / cnt / reg losses
        losses = model([batch_imgs, batch_boxes, batch_classes])
        loss = losses[-1]
        # average the loss first, then back-propagate
        total_loss = loss.mean()
        total_loss.backward()
        # 3. apply the gradients
        optimizer.step()

        # ========================= logging =========================
        end_time = time.time()
        cost_time = int((end_time - start_time) * 1000)
        if GLOBAL_STEPS % 50 == 0:
            print(
                "global_steps:%d epoch:%d steps:%d/%d cls_loss:%.4f cnt_loss:%.4f reg_loss:%.4f cost_time:%dms lr=%.4e total_loss:%.4f" %
                (GLOBAL_STEPS, epoch + 1, epoch_step + 1, steps_per_epoch, losses[0].mean(), losses[1].mean(),
                 losses[2].mean(), cost_time, lr, total_loss))

        GLOBAL_STEPS += 1

    # Save a checkpoint of the model weights at the end of every epoch.
    torch.save(model.state_dict(),
               os.path.join(output_dir, "model_{}.pth".format(epoch + 1)))

global_steps:1 epoch:1 steps:1/5011 cls_loss:1.2560 cnt_loss:0.7801 reg_loss:0.9997 cost_time:1514ms lr=3.9920e-06 total_loss:3.0358
global_steps:2 epoch:1 steps:2/5011 cls_loss:1.1750 cnt_loss:0.7514 reg_loss:0.9999 cost_time:438ms lr=7.9840e-06 total_loss:2.9263
global_steps:3 epoch:1 steps:3/5011 cls_loss:1.2949 cnt_loss:0.7793 reg_loss:0.9997 cost_time:418ms lr=1.1976e-05 total_loss:3.0740
global_steps:4 epoch:1 steps:4/5011 cls_loss:1.3756 cnt_loss:0.9157 reg_loss:1.0000 cost_time:400ms lr=1.5968e-05 total_loss:3.2913
global_steps:5 epoch:1 steps:5/5011 cls_loss:1.0905 cnt_loss:0.7375 reg_loss:0.9995 cost_time:400ms lr=1.9960e-05 total_loss:2.8276
global_steps:6 epoch:1 steps:6/5011 cls_loss:1.1989 cnt_loss:0.8049 reg_loss:1.0000 cost_time:456ms lr=2.3952e-05 total_loss:3.0039
global_steps:7 epoch:1 steps:7/5011 cls_loss:1.0979 cnt_loss:0.7145 reg_loss:1.0000 cost_time:392ms lr=2.7944e-05 total_loss:2.8123
global_steps:8 epoch:1 steps:8/5011 cls_loss:2.7303 cnt_loss:0.0000 reg_los