# Faster R-CNN训练部分
在接下来的部分中，我们对Faster R-CNN进行训练。为了减少训练时间，我们在预训练模型的基础上进行训练。

In [None]:
# Fetch the packaged Faster R-CNN archive (frcnn.tar) from the given bucket
# path into the working directory as ./faster.tar.
from modelarts.session import Session
sess = Session()
sess.download_data(bucket_path="/modelarts-labs/notebook/DL_object_detection_faster/frcnn.tar", path="./faster.tar")
# Unpack the archive into the current directory.
!tar -xf ./faster.tar
# The archive is no longer needed once extracted.
!rm -r ./faster.tar

#### 安装与引用

In [1]:
# Install the extra packages this notebook needs.
!pip install pycocotools
# torchvision is pinned to a specific release.
!pip install torchvision==0.3
# Remove and reinstall protobuf.
# NOTE(review): presumably this repairs a broken pre-installed protobuf; the
# recorded output shows the same version (3.9.1) being reinstalled - confirm.
!pip uninstall -y protobuf
!pip install protobuf

Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Uninstalling protobuf-3.9.1:
  Successfully uninstalled protobuf-3.9.1
Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
Collecting protobuf
[?25l  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/eb/f4/a27952733796330cd17c17ea1f974459f5fefbbad119c0f296a6d807fec3/protobuf-3.9.1-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 59.8MB/s eta 0:00:01
Installing collected packages: protobuf
Successfully installed protobuf-3.9.1
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import tools._init_paths

import tensorboardX as tb
from datasets.factory import get_imdb
from model.train_val import get_training_roidb, train_net
from model.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir

import roi_data_layer.roidb as rdl_roidb
from roi_data_layer.layer import RoIDataLayer
import utils.timer

import pickle

import torch
import torch.optim as optim
from nets.vgg16 import vgg16

import numpy as np
import os
import sys
import glob
import time

#### 模型训练中的参数设置

In [3]:
# Name of the dataset used for training (passed to combined_roidb below).
imdb_name = "voc_2007_trainval"
# Name of the dataset used for validation.
imdbval_name = "voc_2007_test"
# Location of the pretrained model weights
weight = "../data/imagenet_weights/vgg16.pth"
# Number of training iterations (upper bound of the training loop)
max_iters = 1200
# Path of an optional cfg file; when None, cfg_from_file is not called
cfg_file = None
# Optional config overrides for cfg_from_list; when None, it is not called
set_cfgs = None

#### 定义加载数据集函数

In [4]:
def combined_roidb(imdb_names):
    """Load one or more datasets and return ``(imdb, roidb)`` for them.

    Args:
        imdb_names: A dataset name, or several names joined with ``'+'``
            (for example ``"voc_2007_trainval+voc_2012_trainval"``).

    Returns:
        A tuple ``(imdb, roidb)``. ``roidb`` is the roidb of the first
        dataset, extended in place with the roidbs of all the others.
        For a single name ``imdb`` is ``get_imdb(imdb_names)``; for several
        names it is a ``datasets.imdb.imdb`` named after the full
        ``imdb_names`` string and carrying the classes of the second dataset.
    """
    # The notebook only runs `from datasets.factory import get_imdb`, which
    # does not bind the name `datasets`; import it here so the multi-dataset
    # branch below does not fail with NameError.
    import datasets.imdb

    def get_roidb(imdb_name):
        # Load the dataset
        imdb = get_imdb(imdb_name)
        print('Loaded dataset `{:s}` for training'.format(imdb.name))
        # Use the proposal method configured in cfg.TRAIN.PROPOSAL_METHOD
        imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
        print('Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD))
        roidb = get_training_roidb(imdb)
        return roidb

    names = imdb_names.split('+')
    roidbs = [get_roidb(s) for s in names]
    roidb = roidbs[0]
    if len(roidbs) > 1:
        # Merge every additional roidb into the first one.
        for r in roidbs[1:]:
            roidb.extend(r)
        # The combined imdb takes its class list from the second dataset.
        tmp = get_imdb(names[1])
        imdb = datasets.imdb.imdb(imdb_names, tmp.classes)
    else:
        imdb = get_imdb(imdb_names)
    return imdb, roidb

#### 训练过程

In [5]:
# Apply optional configuration overrides before anything reads cfg.
if cfg_file is not None:
    cfg_from_file(cfg_file)
if set_cfgs is not None:
    cfg_from_list(set_cfgs)

print('Using config:')
print(cfg)

# Seed NumPy's RNG from the config.
np.random.seed(cfg.RNG_SEED)

# Load the training dataset
imdb, roidb = combined_roidb(imdb_name)
print('{:d} roidb entries'.format(len(roidb)))

# Set the output directory
output_dir = get_output_dir(imdb, None)
print('Output will be saved to `{:s}`'.format(output_dir))

# Set the directory where summaries/logs are written
tb_dir = get_output_tb_dir(imdb, None)
print('TensorFlow summaries will be saved to `{:s}`'.format(tb_dir))

# Load the validation dataset with cfg.TRAIN.USE_FLIPPED temporarily set to
# False. The original value is restored in `finally`, so a failure while
# loading cannot leave the flag switched off for a later re-run of this cell
# (which would otherwise load the training set with the flag disabled).
orgflip = cfg.TRAIN.USE_FLIPPED
cfg.TRAIN.USE_FLIPPED = False
try:
    _, valroidb = combined_roidb(imdbval_name)
finally:
    cfg.TRAIN.USE_FLIPPED = orgflip
print('{:d} validation roidb entries'.format(len(valroidb)))

# Create the backbone network.
# This example uses VGG16; other architectures (e.g. ResNet) can be tried.
net = vgg16()


Using config:
{'TRAIN': {'LEARNING_RATE': 0.001, 'MOMENTUM': 0.9, 'WEIGHT_DECAY': 0.0001, 'GAMMA': 0.1, 'STEPSIZE': [30000], 'DISPLAY': 10, 'DOUBLE_BIAS': True, 'TRUNCATED': False, 'BIAS_DECAY': False, 'USE_GT': False, 'ASPECT_GROUPING': False, 'SNAPSHOT_KEPT': 3, 'SUMMARY_INTERVAL': 180, 'SCALES': [600], 'MAX_SIZE': 1000, 'IMS_PER_BATCH': 1, 'BATCH_SIZE': 128, 'FG_FRACTION': 0.25, 'FG_THRESH': 0.5, 'BG_THRESH_HI': 0.5, 'BG_THRESH_LO': 0.1, 'USE_FLIPPED': True, 'BBOX_REG': True, 'BBOX_THRESH': 0.5, 'SNAPSHOT_ITERS': 5000, 'SNAPSHOT_PREFIX': 'res101_faster_rcnn', 'BBOX_NORMALIZE_TARGETS': True, 'BBOX_INSIDE_WEIGHTS': [1.0, 1.0, 1.0, 1.0], 'BBOX_NORMALIZE_TARGETS_PRECOMPUTED': True, 'BBOX_NORMALIZE_MEANS': [0.0, 0.0, 0.0, 0.0], 'BBOX_NORMALIZE_STDS': [0.1, 0.1, 0.2, 0.2], 'PROPOSAL_METHOD': 'gt', 'HAS_RPN': True, 'RPN_POSITIVE_OVERLAP': 0.7, 'RPN_NEGATIVE_OVERLAP': 0.3, 'RPN_CLOBBER_POSITIVES': False, 'RPN_FG_FRACTION': 0.5, 'RPN_BATCHSIZE': 256, 'RPN_NMS_THRESH': 0.7, 'RPN_PRE_NMS_TOP_N

In [6]:
from model.train_val import filter_roidb, SolverWrapper
# Filter both roidbs, discarding invalid ROI entries.
roidb = filter_roidb(roidb)
valroidb = filter_roidb(valroidb)

# Bundle the network, datasets, output locations and pretrained weights
# into a single solver object.
sw = SolverWrapper(net, imdb, roidb, valroidb, output_dir, tb_dir,
                   pretrained_model=weight)

print('Solving...')

Filtered 0 roidb entries: 10022 -> 10022
Filtered 0 roidb entries: 4952 -> 4952
Solving...


In [7]:
# List every attribute currently stored on the solver wrapper
vars(sw).keys()

dict_keys(['net', 'imdb', 'roidb', 'valroidb', 'output_dir', 'tbdir', 'tbvaldir', 'pretrained_model'])

In [8]:
# At this point sw.net is only the backbone
print(sw.net)

vgg16()


In [9]:
# Build the network: attach the ROI data layers (training and validation)
sw.data_layer = RoIDataLayer(sw.roidb, sw.imdb.num_classes)
sw.data_layer_val = RoIDataLayer(sw.valroidb, sw.imdb.num_classes, random=True)

# Build the network: add the ROI and classifier parts on top of VGG16
lr, train_op = sw.construct_graph()

# Look for snapshots from a previous run
# (lsf is presumably the number of snapshot files found - confirm)
lsf, nfiles, sfiles = sw.find_previous()

# Snapshots allow training to resume from a checkpoint: start fresh when
# none was found, otherwise restore from the last entries of sfiles/nfiles
if lsf == 0:
    lr, last_snapshot_iter, stepsizes, np_paths, ss_paths = sw.initialize(
    )
else:
    lr, last_snapshot_iter, stepsizes, np_paths, ss_paths = sw.restore(
        str(sfiles[-1]), str(nfiles[-1]))
# Training continues at the iteration after the last snapshot
iter = last_snapshot_iter + 1
last_summary_time = time.time()
# Appending max_iters guarantees the list is not empty; after reversing,
# pop() returns what was the first element before the reversal
stepsizes.append(max_iters)
stepsizes.reverse()
next_stepsize = stepsizes.pop()
# Switch the net to training mode and move it to its device; as the last
# expression of the cell, the result of .to() is what the notebook displays
print("网络结构")
sw.net.train()
sw.net.to(sw.net._device)

Restoring model snapshots from /home/ma-user/work/output/default/voc_2007_trainval/default/res101_faster_rcnn_iter_1000.pth
Restored.
网络结构


vgg16(
  (vgg): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace)
      (16): MaxPool2d(kernel_size=2, stride=2, padd

In [10]:
# Start training

# Loop-invariant, so it is set once instead of on every iteration.
# NOTE(review): this writes the top-level key cfg.SNAPSHOT_PREFIX, while the
# printed config keeps the prefix under cfg.TRAIN.SNAPSHOT_PREFIX and the
# restored snapshot in the recorded output is named res101_faster_rcnn_*.
# The assignment therefore does not appear to affect snapshot names -
# confirm the intent before switching it to cfg.TRAIN.SNAPSHOT_PREFIX,
# because the resume logic looks snapshots up by prefix.
cfg.SNAPSHOT_PREFIX = "VGG_faster_rcnn"

while iter < max_iters + 1:
    if iter == next_stepsize + 1:
        # Take a snapshot before lowering the learning rate
        sw.snapshot(iter)
        lr *= cfg.TRAIN.GAMMA
        # The original called scale_lr(), which is never imported or defined
        # in this notebook and raised NameError on the first learning-rate
        # step. Scale the optimizer's parameter groups directly instead
        # (assumes sw.optimizer is a torch.optim optimizer - confirm).
        for param_group in sw.optimizer.param_groups:
            param_group['lr'] *= cfg.TRAIN.GAMMA
        next_stepsize = stepsizes.pop()

    utils.timer.timer.tic()
    # Fetch the next batch of blobs from the ROI data layer
    blobs = sw.data_layer.forward()

    now = time.time()
    if iter == 1 or now - last_summary_time > cfg.TRAIN.SUMMARY_INTERVAL:
        # Training step that also returns summaries: compute the losses and
        # update the model
        rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss, summary = \
          sw.net.train_step_with_summary(blobs, sw.optimizer)
        for _sum in summary:
            sw.writer.add_summary(_sum, float(iter))
        # Compute summaries on a validation batch as well
        blobs_val = sw.data_layer_val.forward()
        summary_val = sw.net.get_summary(blobs_val)
        for _sum in summary_val:
            sw.valwriter.add_summary(_sum, float(iter))
        last_summary_time = now
    else:
        # Plain training step without summaries
        rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss = \
          sw.net.train_step(blobs, sw.optimizer)
    utils.timer.timer.toc()

    # Report the losses every cfg.TRAIN.DISPLAY iterations
    if iter % (cfg.TRAIN.DISPLAY) == 0:
        print('iter: %d / %d, total loss: %.6f\n >>> rpn_loss_cls: %.6f\n '
              '>>> rpn_loss_box: %.6f\n >>> loss_cls: %.6f\n >>> loss_box: %.6f\n >>> lr: %f' % \
              (iter, max_iters, total_loss, rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, lr))
        print('speed: {:.3f}s / iter'.format(
            utils.timer.timer.average_time()))

    # Store a snapshot every cfg.TRAIN.SNAPSHOT_ITERS iterations
    if iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
        last_snapshot_iter = iter
        ss_path, np_path = sw.snapshot(iter)
        np_paths.append(np_path)
        ss_paths.append(ss_path)

        # Remove surplus snapshots once more than SNAPSHOT_KEPT exist
        if len(np_paths) > cfg.TRAIN.SNAPSHOT_KEPT:
            sw.remove_snapshot(np_paths, ss_paths)

    iter += 1

# Make sure the final iteration is saved if it was not just snapshotted
if last_snapshot_iter != iter - 1:
    sw.snapshot(iter - 1)

sw.writer.close()
sw.valwriter.close()

iter: 1010 / 1200, total loss: 1.111544
 >>> rpn_loss_cls: 0.132086
 >>> rpn_loss_box: 0.047608
 >>> loss_cls: 0.371774
 >>> loss_box: 0.560076
 >>> lr: 0.001000
speed: 0.129s / iter
iter: 1020 / 1200, total loss: 1.315273
 >>> rpn_loss_cls: 0.108653
 >>> rpn_loss_box: 0.052084
 >>> loss_cls: 0.648501
 >>> loss_box: 0.506036
 >>> lr: 0.001000
speed: 0.127s / iter
iter: 1030 / 1200, total loss: 0.584790
 >>> rpn_loss_cls: 0.130646
 >>> rpn_loss_box: 0.007096
 >>> loss_cls: 0.303233
 >>> loss_box: 0.143815
 >>> lr: 0.001000
speed: 0.128s / iter
iter: 1040 / 1200, total loss: 1.648899
 >>> rpn_loss_cls: 0.367199
 >>> rpn_loss_box: 0.325622
 >>> loss_cls: 0.578676
 >>> loss_box: 0.377402
 >>> lr: 0.001000
speed: 0.128s / iter
iter: 1050 / 1200, total loss: 1.302446
 >>> rpn_loss_cls: 0.066994
 >>> rpn_loss_box: 0.023159
 >>> loss_cls: 0.408805
 >>> loss_box: 0.803488
 >>> lr: 0.001000
speed: 0.128s / iter
iter: 1060 / 1200, total loss: 1.470572
 >>> rpn_loss_cls: 0.179594
 >>> rpn_loss_box