# model/utils/creator_tool.py文件 

这个脚本实现了三个Creator类（均通过 `__call__` 像函数一样调用），分别是：

ProposalCreator

AnchorTargetCreator

ProposalTargetCreator

前两个都服务于RPN网络（ProposalCreator由RPN的输出生成RoIs，AnchorTargetCreator为RPN的训练提供目标），第三个为RoIHead网络的训练提供目标



In [None]:
class ProposalCreator:
    """Turn RPN outputs into a bounded set of region proposals (RoIs).

    For one image the creator decodes the predicted offsets against the
    anchors, clips the boxes to the image, drops boxes that are too small,
    keeps the highest scoring ``n_pre_nms`` boxes, runs NMS on them and
    finally keeps at most ``n_post_nms`` of the survivors.

    With the default settings this is 12000 -> 2000 boxes while the parent
    model is in training mode and 6000 -> 300 otherwise.

    Args:
        parent_model: Object whose ``training`` attribute selects the
            train or test limits (presumably the owning ``nn.Module``).
        n_train_pre_nms: Boxes kept before NMS in training mode.
        n_train_post_nms: Boxes kept after NMS in training mode.
        n_test_pre_nms: Boxes kept before NMS in test mode.
        n_test_post_nms: Boxes kept after NMS in test mode.
        min_size: Minimum box height/width (before multiplying by
            ``scale``) for a box to be kept.
        nms_thresh: IoU threshold handed to ``nms``. Added as the last
            parameter so existing positional callers keep working.
    """

    def __init__(self,
                 parent_model,
                 n_train_pre_nms=12000,
                 n_train_post_nms=2000,
                 n_test_pre_nms=6000,
                 n_test_post_nms=300,
                 min_size=16,
                 nms_thresh=0.7):
        self.parent_model = parent_model
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score,
                 anchor, img_size, scale=1):
        """Return the proposals selected for one image.

        Args:
            loc: Predicted box offsets, one row per anchor (output of the
                RPN regression branch).
            score: Foreground score per anchor (output of the RPN
                classification branch).
            anchor: Anchor boxes that ``loc`` is relative to.
            img_size: ``(height, width)`` used to clip the boxes.
            scale: Factor applied to ``min_size`` before filtering.

        Returns:
            Array holding the rows of the decoded boxes that survived
            filtering and NMS.
        """
        # Training and inference use different proposal budgets.
        if self.parent_model.training:
            n_pre_nms = self.n_train_pre_nms
            n_post_nms = self.n_train_post_nms
        else:
            n_pre_nms = self.n_test_pre_nms
            n_post_nms = self.n_test_post_nms

        # Decode anchors + offsets into candidate boxes.
        roi = loc2bbox(anchor, loc)

        # Clip the boxes to the image: columns 0 and 2 are limited to
        # [0, img_size[0]], columns 1 and 3 to [0, img_size[1]].
        roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
        roi[:, slice(1, 4, 2)] = np.clip(
            roi[:, slice(1, 4, 2)], 0, img_size[1])

        # Drop boxes whose height or width is below the scaled threshold.
        min_size = self.min_size * scale
        hs = roi[:, 2] - roi[:, 0]  # box heights
        ws = roi[:, 3] - roi[:, 1]  # box widths
        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
        roi = roi[keep, :]
        score = score[keep]

        # Sort by score (descending) and keep the best n_pre_nms boxes.
        order = score.ravel().argsort()[::-1]
        if n_pre_nms > 0:
            order = order[:n_pre_nms]
        roi = roi[order, :]
        score = score[order]

        # NMS runs on the GPU; `keep` holds the indices of retained boxes.
        keep = nms(
            torch.from_numpy(roi).cuda(),
            torch.from_numpy(score).cuda(),
            self.nms_thresh)
        if n_post_nms > 0:
            keep = keep[:n_post_nms]
        roi = roi[keep.cpu().numpy()]
        return roi


In [None]:
class AnchorTargetCreator(object):
    """Build the training targets (labels and offsets) for the RPN.

    Every anchor that lies inside the image is labelled positive (1),
    negative (0) or ignored (-1) from its IoU with the ground-truth boxes,
    and the labelled set is randomly thinned so that at most ``n_sample``
    anchors remain labelled, of which at most ``pos_ratio * n_sample`` are
    positive. Regression targets are the offsets from each anchor to the
    ground-truth box it overlaps most.

    Args:
        n_sample: Maximum number of anchors that keep a 0/1 label.
        pos_iou_thresh: Anchors whose best IoU is at least this value are
            positive.
        neg_iou_thresh: Anchors whose best IoU is below this value are
            negative.
        pos_ratio: Upper bound on the fraction of positives in the sample.
    """

    def __init__(self,
                 n_sample=256,
                 pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5):
        self.n_sample = n_sample
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh = neg_iou_thresh
        self.pos_ratio = pos_ratio

    def __call__(self, bbox, anchor, img_size):
        """Compute per-anchor regression targets and labels.

        Args:
            bbox: Ground-truth boxes of the image.
            anchor: All anchors generated for the image.
            img_size: ``(height, width)`` of the image.

        Returns:
            ``(loc, label)`` with one entry per input anchor. Anchors that
            are outside the image get ``loc`` filled with 0 and ``label``
            filled with -1.
        """
        img_H, img_W = img_size

        n_anchor = len(anchor)
        # Only anchors that lie inside the image take part in labelling.
        inside_index = _get_inside_index(anchor, img_H, img_W)
        anchor = anchor[inside_index]
        argmax_ious, label = self._create_label(
            inside_index, anchor, bbox)

        # Offsets from each inside anchor to its best-matching GT box.
        loc = bbox2loc(anchor, bbox[argmax_ious])

        # Scatter the results back to the full anchor set.
        label = _unmap(label, n_anchor, inside_index, fill=-1)
        loc = _unmap(loc, n_anchor, inside_index, fill=0)

        return loc, label

    def _create_label(self, inside_index, anchor, bbox):
        """Label the inside anchors: 1 positive, 0 negative, -1 ignored.

        Returns:
            ``(argmax_ious, label)`` where ``argmax_ious[i]`` is the index
            of the ground-truth box with the highest IoU for anchor ``i``.
        """
        label = np.empty((len(inside_index),), dtype=np.int32)
        label.fill(-1)

        argmax_ious, max_ious, gt_argmax_ious = \
            self._calc_ious(anchor, bbox, inside_index)

        # Order matters: negatives first, so that the two positive rules
        # below can override a negative label.
        label[max_ious < self.neg_iou_thresh] = 0
        # The best anchor(s) of every GT box are positive regardless of IoU.
        label[gt_argmax_ious] = 1
        label[max_ious >= self.pos_iou_thresh] = 1

        self._subsample(label)
        return argmax_ious, label

    def _subsample(self, label):
        """Randomly reset surplus positives/negatives in ``label`` to -1.

        Keeps at most ``int(pos_ratio * n_sample)`` positives, then at most
        ``n_sample - (#positives kept)`` negatives. Modifies ``label`` in
        place and also returns it.
        """
        n_pos = int(self.pos_ratio * self.n_sample)
        pos_index = np.where(label == 1)[0]
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(
                pos_index, size=(len(pos_index) - n_pos), replace=False)
            label[disable_index] = -1

        # Negatives fill whatever part of the budget positives left unused.
        n_neg = self.n_sample - np.sum(label == 1)
        neg_index = np.where(label == 0)[0]
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(
                neg_index, size=(len(neg_index) - n_neg), replace=False)
            label[disable_index] = -1
        return label

    def _calc_ious(self, anchor, bbox, inside_index):
        """Compute the anchor/GT IoU matrix and its row/column maxima.

        Returns:
            ``(argmax_ious, max_ious, gt_argmax_ious)``:
            for every anchor the index of its best GT box and that IoU,
            and the indices of all anchors that attain the maximum IoU of
            some GT box (ties included).
        """
        # ious[n, k] is the IoU between anchor n and GT box k.
        ious = bbox_iou(anchor, bbox)

        argmax_ious = ious.argmax(axis=1)
        max_ious = ious[np.arange(len(inside_index)), argmax_ious]

        gt_argmax_ious = ious.argmax(axis=0)
        gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
        # argmax returns one anchor per GT box; the comparison below also
        # picks up every other anchor that ties with that maximum.
        gt_argmax_ious = np.where(ious == gt_max_ious)[0]

        return argmax_ious, max_ious, gt_argmax_ious

下面是ProposalTargetCreator的代码

目的：为2000个rois赋予ground truth 严格讲是挑出128个赋予ground truth

输入：2000个rois、一个batch（一张图）中所有的bbox ground truth (R,4)、对应bbox所包含的label(R,1) （VOC2007来说20类 0-19）

输出：128个sample roi(128,4)、128个gt_roi_loc(128,4)、128个gt_roi_label(128,1)


In [None]:
class ProposalTargetCreator(object):
    """Sample RoIs and attach ground-truth targets for the RoI head.

    The proposals are pooled together with the ground-truth boxes, each
    candidate is matched to the ground-truth box it overlaps most, and up
    to ``n_sample`` candidates are drawn: positives (IoU at least
    ``pos_iou_thresh``) first, capped at ``round(n_sample * pos_ratio)``,
    then negatives (IoU in ``[neg_iou_thresh_lo, neg_iou_thresh_hi)``)
    for the remainder.

    Args:
        n_sample: Number of RoIs to sample.
        pos_ratio: Maximum fraction of the sample that may be positive.
        pos_iou_thresh: Minimum IoU for a RoI to count as positive.
        neg_iou_thresh_hi: Exclusive upper IoU bound for negatives.
        neg_iou_thresh_lo: Inclusive lower IoU bound for negatives.
    """

    def __init__(self,
                 n_sample=128,
                 pos_ratio=0.25, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0):
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE:default 0.1 in py-faster-rcnn

    def __call__(self, roi, bbox, label,
                 loc_normalize_mean=(0, 0, 0, 0),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        """Sample RoIs and compute their classification/regression targets.

        Args:
            roi: Proposals, one box per row.
            bbox: Ground-truth boxes, one box per row.
            label: Class index of every ground-truth box, starting at 0.
            loc_normalize_mean: Per-coordinate mean subtracted from the
                offsets (4 values).
            loc_normalize_std: Per-coordinate divisor applied to the
                offsets (4 values).

        Returns:
            ``(sample_roi, gt_roi_loc, gt_roi_label)``: the sampled boxes,
            their normalised offsets to the matched ground-truth box, and
            their labels, where 0 marks background and ``k + 1`` marks
            foreground class ``k``.
        """
        # Ground-truth boxes are added as candidates as well.
        roi = np.concatenate((roi, bbox), axis=0)

        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
        iou = bbox_iou(roi, bbox)
        # Best-matching GT box per candidate and the corresponding IoU.
        gt_assignment = iou.argmax(axis=1)
        max_iou = iou.max(axis=1)
        # Shift class ids from [0, n_fg_class - 1] to [1, n_fg_class];
        # 0 is reserved for background.
        gt_roi_label = label[gt_assignment] + 1

        # Positives: enough overlap with some GT box; surplus ones are
        # dropped at random.
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        if pos_index.size > 0:
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)

        # Negatives: best IoU within [neg_iou_thresh_lo, neg_iou_thresh_hi).
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
        neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                         neg_index.size))
        if neg_index.size > 0:
            neg_index = np.random.choice(
                neg_index, size=neg_roi_per_this_image, replace=False)

        # Positives come first in keep_index, so everything after them is
        # a negative and gets the background label.
        keep_index = np.append(pos_index, neg_index)
        gt_roi_label = gt_roi_label[keep_index]
        gt_roi_label[pos_roi_per_this_image:] = 0
        sample_roi = roi[keep_index]

        # Regression targets, normalised with the given mean/std. Whoever
        # decodes predictions trained on these targets has to undo this
        # normalisation.
        gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))

        return sample_roi, gt_roi_loc, gt_roi_label


# Backward-compatible alias: this class used to be defined under the name
# ProposalCreator and code in this file still instantiates it by that name.
# NOTE(review): this rebinds the name of the RPN proposal generator defined
# earlier in the file, exactly as the old class statement did; drop the
# alias once callers use ProposalTargetCreator.
ProposalCreator = ProposalTargetCreator

# Trainer.py

In [None]:
class FasterRCNNTrainer(nn.Module):
    """Wrap a Faster R-CNN model and compute its four training losses.

    ``forward`` runs the feature extractor, the RPN and the RoI head of
    the wrapped model on one image and returns the RPN localisation and
    classification losses, the RoI localisation and classification
    losses, and their sum.

    NOTE(review): the training loop in this file also calls
    ``train_step``, ``reset_meters``, ``get_meter_data``, ``save`` and
    ``load`` on the trainer; they are not part of the class as shown here
    and are presumably defined elsewhere - confirm.

    Args:
        faster_rcnn: Model exposing ``extractor``, ``rpn``, ``head``,
            ``loc_normalize_mean``, ``loc_normalize_std`` and
            ``get_optimizer()``.
    """

    def __init__(self, faster_rcnn):
        super(FasterRCNNTrainer, self).__init__()

        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = opt.rpn_sigma
        self.roi_sigma = opt.roi_sigma

        self.anchor_target_creator = AnchorTargetCreator()
        # NOTE(review): by the time this runs, the module-level name
        # ProposalCreator refers to the proposal *target* sampler (the
        # last class bound to that name in this file), not to the RPN
        # proposal generator.
        self.proposal_target_creator = ProposalCreator()
        self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
        self.loc_normalize_std = faster_rcnn.loc_normalize_std

        self.optimizer = self.faster_rcnn.get_optimizer()
        self.vis = Visualizer(env=opt.env)

        # Confusion meters: 2 classes for the RPN, 21 for the RoI head.
        self.rpn_cm = ConfusionMeter(2)
        self.roi_cm = ConfusionMeter(21)
        self.meters = {k: AverageValueMeter() for k in LossTuple._fields}

    def forward(self, img, bboxes, labels, scale):
        """Compute the losses for a single image.

        Args:
            img: Image batch; its last two dimensions are taken as
                height and width.
            bboxes: Ground-truth boxes with a leading batch dimension.
            labels: Ground-truth class ids with a leading batch dimension.
            scale: Scale factor forwarded to the RPN.

        Returns:
            ``LossTuple`` built from ``[rpn_loc_loss, rpn_cls_loss,
            roi_loc_loss, roi_cls_loss, total_loss]``.

        Raises:
            ValueError: If the batch holds more than one image.
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported')
        _, _, H, W = img.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(img)
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Batch size is 1, so work on the first (only) element.
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and build the classification/regression targets
        # that the RoI head is trained against.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)

        # All sampled RoIs belong to image 0 of the batch.
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        # Per-anchor regression targets and labels (-1 marks anchors that
        # are ignored).
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # Anchors labelled -1 do not contribute to the classification loss.
        rpn_cls_loss = F.cross_entropy(
            rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        # Feed only the labelled anchors to the confusion meter.
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        # One 4-vector of offsets per (RoI, class) pair.
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)
        # For every RoI pick the offsets predicted for its target class.
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              gt_roi_label]

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)

def _smooth_l1_loss(x, t, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - t)
    abs_diff = diff.abs()
    flag = (abs_diff.data < (1. / sigma2)).float()
    y = (flag * (sigma2 / 2.) * (diff ** 2) +
         (1 - flag) * (abs_diff - 0.5 / sigma2))
    return y.sum()

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Return the smooth-L1 localisation loss over the positive samples.

    Only rows whose label is greater than 0 contribute to the sum; the
    sum is then divided by the number of rows whose label is at least 0,
    so rows labelled -1 are left out of both.

    The weight mask is created on the device of ``gt_loc`` instead of
    unconditionally on CUDA, so the function also works on CPU tensors.

    Args:
        pred_loc: Predicted offsets, one row per sample.
        gt_loc: Target offsets with the same shape as ``pred_loc``.
        gt_label: Label per sample (one entry per row of ``gt_loc``).
        sigma: Smooth-L1 sigma forwarded to ``_smooth_l1_loss``.

    Returns:
        Scalar loss tensor.
    """
    in_weight = t.zeros(gt_loc.shape, device=gt_loc.device)
    # Broadcast the per-row "is positive" flag over all coordinates.
    positive = (gt_label > 0).view(-1, 1).expand_as(in_weight)
    in_weight[positive.to(in_weight.device)] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    # Normalize by the total number of negative and positive samples;
    # label -1 is excluded from the count.
    loc_loss /= ((gt_label >= 0).sum().float())
    return loc_loss


    

    


In [3]:
# Notebook-only setup: mount Google Drive at /content/drive inside the Colab
# runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Train.py

In [None]:
def train(**kwargs):
    """Train Faster R-CNN and evaluate it after every epoch.

    Keyword arguments are handed to ``opt._parse`` before anything else.
    Each epoch iterates once over the training set (batch size 1), plots
    losses, example predictions and confusion matrices every
    ``opt.plot_every`` iterations, and then evaluates on the test set.
    The checkpoint with the best mAP so far is saved. After epoch index 9
    the best checkpoint is reloaded (if one exists) and the learning rate
    is scaled by ``opt.lr_decay``; training stops after epoch index 13 or
    after ``opt.epoch`` epochs, whichever comes first.
    """
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    # Stays None until some epoch beats best_map, so the reload at epoch 9
    # can be skipped instead of failing on an unbound name.
    best_path = None

    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            # One optimisation step on this image.
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                # Creating the debug file drops the run into the debugger.
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()),
                                 win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())

        # NOTE(review): this relies on a project-level `eval` function that
        # shadows the builtin (the builtin accepts no `test_num` keyword);
        # it is not defined in the part of the file shown - confirm.
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            # Continue from the best checkpoint with a decayed learning rate.
            if best_path is not None:
                trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        if epoch == 13:
            break