In [25]:
import os
import torch

import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import (
    CityscapesEvaluator,
    COCOEvaluator,
    DatasetEvaluators,
    LVISEvaluator,
    verify_results,
)
from detectron2.data.datasets.register_coco import register_coco_instances 
from point_rend import add_pointrend_config
from detectron2 import model_zoo

`register_coco_instances` 用于把 COCO 格式的数据集以指定的名字注册到 detectron2 中，注册之后就可以通过 `DatasetCatalog` 和 `MetadataCatalog` 按名字取用。

In [None]:
# Register the COCO-format train/val splits under custom dataset names.
# Each entry is (dataset name, annotation json, image directory).
# Registering a name that already exists fails, so names already present in
# DatasetCatalog are skipped; this keeps the cell safe to re-run in a live kernel.
for split_name, json_file, image_root in (
    ('self_train',
     '/dev/COCO/annotations_trainval2017/annotations/instances_train2017.json',
     '/dev/COCO/train2017'),
    ('self_val',
     '/dev/COCO/annotations_trainval2017/annotations/instances_val2017.json',
     '/dev/COCO/val2017'),
):
    if split_name not in DatasetCatalog.list():
        register_coco_instances(split_name, {}, json_file, image_root)

In [12]:
# Look up the metadata objects of both registered splits, train first, then val.
coco_train_metadata, coco_val_metadata = (
    MetadataCatalog.get(split_name) for split_name in ('self_train', 'self_val')
)
# Fetch the data registered under 'self_train'; the recorded output of this
# cell shows the category-id remapping message printed during this call.
dataset_dicts = DatasetCatalog.get('self_train')


Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.



`cfg` 是 detectron2 中 `CfgNode` 类的一个对象；该 `CfgNode` 类继承自 `fvcore.common.config` 中的 `CfgNode`（在源码里通过 `from fvcore.common.config import CfgNode as _CfgNode` 以别名 `_CfgNode` 导入）。  

In [13]:
# Create the config object that the following cells extend and modify.
cfg = get_cfg() 

将`pointrend`需要的配置加到了`cfg`对象中

In [14]:
# Add the PointRend-specific config entries to cfg; the call's result is not
# assigned, so cfg itself is modified.
add_pointrend_config(cfg)

In [18]:
# Show the current working directory; the relative config path used later is resolved against it.
%pwd

'/home/weiweia92/detectron2_repo/projects/PointRend'

In [46]:
# Merge the PointRend R50-FPN 1x COCO yaml into cfg. The path is relative to the
# current working directory, which the recorded `pwd` output shows to be the
# PointRend project directory (.../detectron2_repo/projects/PointRend).
cfg.merge_from_file('configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml')

为了理解 `cfg` 对象的 `merge_from_file` 方法做了什么，可以参考 https://github.com/facebookresearch/detectron2/blob/master/detectron2/config/config.py 中的部分源代码（第二段 `CfgNode` 摘自 `fvcore.common.config`）：  
```
from fvcore.common.config import CfgNode as _CfgNode

class CfgNode(_CfgNode):
    ...
    def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
        assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!"
        loaded_cfg = _CfgNode.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe)
        loaded_cfg = type(self)(loaded_cfg)
    ...

import yaml
from yacs.config import CfgNode as _CfgNode

class CfgNode(_CfgNode):
    # filename 为 yaml 文件的路径
    @staticmethod
    def load_yaml_with_base(filename: str, allow_unsafe: bool = False) -> None:
        with PathManager.open(filename, "r") as f:
            try:
                cfg = yaml.safe_load(f)
    ...
```


In [48]:
# Print the example YAML file that the next cells parse.
%cat /home/weiweia92/yaml_example.yaml

name: junxi
age: 18
spouse:
    name: Rui
    age: 18
children:
    - name: Chen You
      age: 3
    - name: Ruo Xi
      age: 2

`yaml` 经常被用作配置文件格式，层级结构通过空格缩进来表示；列表中的每一项以 "-" 开头，字典中的键和值用 ":" 分隔。

In [50]:
import yaml

# Parse the example file to show what yaml.safe_load returns.
# The `with` block closes the file handle as soon as parsing is finished,
# instead of leaving it open for the lifetime of the kernel.
with open('/home/weiweia92/yaml_example.yaml') as f:
    content = yaml.safe_load(f)
print(type(content))
print(content)

<class 'dict'>
{'name': 'junxi', 'age': 18, 'spouse': {'name': 'Rui', 'age': 18}, 'children': [{'name': 'Chen You', 'age': 3}, {'name': 'Ruo Xi', 'age': 2}]}


In [30]:
# Train on the 'self_train' split registered earlier; the value is a tuple of dataset names.
cfg.DATASETS.TRAIN = ('self_train',)

In [32]:
# No test datasets are configured; 'self_val' is registered earlier in this
# notebook but is not referenced here.
cfg.DATASETS.TEST = ()

In [33]:
# Number of data-loading workers used by the dataloader.
# NOTE(review): presumably one worker process each - confirm against the detectron2 config reference.
cfg.DATALOADER.NUM_WORKERS = 8

In [39]:
# Initial weights: checkpoint URL that model_zoo returns for the Mask R-CNN R50-FPN 1x COCO config.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
# NOTE(review): the values below look like small-dataset tutorial settings. The
# recorded training log reports 118287 images, so treat this as a short smoke
# test rather than a real training schedule.
cfg.SOLVER.IMS_PER_BATCH = 2  # images per training batch
cfg.SOLVER.BASE_LR = 0.00025  # base learning rate
cfg.SOLVER.MAX_ITER = 300    # total training iterations; very short for a dataset of this size
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # lowered from 512 to reduce per-image ROI work
# 80 classes for the ROI heads.
# NOTE(review): if the class count is ever changed, check whether PointRend's
# point head has its own class-count setting that must match - TODO confirm.
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 80

In [41]:
class Trainer(DefaultTrainer):
    """
    Trainer based on "DefaultTrainer" that picks an evaluator per dataset.

    "DefaultTrainer" contains a number of pre-defined pieces of logic for the
    standard training workflow. They may not work for you, especially if you
    are working on a new research project. In that case you can use the cleaner
    "SimpleTrainer", or write your own training loop.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create the evaluator for a given dataset.

        The evaluator is chosen from the "evaluator_type" metadata attached to
        the dataset in MetadataCatalog. For your own dataset you can simply
        create an evaluator manually instead of relying on this dispatch.

        Args:
            cfg: config object; ``cfg.OUTPUT_DIR`` is read when no output
                folder is given, and cfg is forwarded to the LVIS and COCO
                evaluators.
            dataset_name: name under which the dataset is registered.
            output_folder: directory handed to the LVIS and COCO evaluators.
                Defaults to ``<cfg.OUTPUT_DIR>/inference``.

        Returns:
            An LVISEvaluator, COCOEvaluator or CityscapesEvaluator, depending
            on the dataset's evaluator type.

        Raises:
            AssertionError: for "cityscapes" when the process rank is not
                smaller than the number of GPUs visible on this machine.
            NotImplementedError: when the evaluator type is none of
                "lvis", "coco" or "cityscapes".
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, cfg, True, output_folder)
        if evaluator_type == "coco":
            return COCOEvaluator(dataset_name, cfg, True, output_folder)
        if evaluator_type == "cityscapes":
            # Ranks start at 0, so on a single machine every rank is strictly
            # smaller than the local GPU count. The previous ">=" also accepted
            # rank == device_count, i.e. a process on a second machine.
            assert (
                torch.cuda.device_count() > comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesEvaluator(dataset_name)
        # No branch above matched. The original code collected evaluators in a
        # list that was never filled, so it always ended up raising here.
        raise NotImplementedError(
            "no Evaluator for the dataset {} with the type {}".format(
                dataset_name, evaluator_type
            )
        )

In [52]:
# Make sure the output directory exists before the trainer is built.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = Trainer(cfg) 
# resume=False: presumably starts from cfg.MODEL.WEIGHTS instead of resuming a
# previous run - confirm against the DefaultTrainer.resume_or_load documentation.
trainer.resume_or_load(resume=False)
# NOTE(review): the recorded output of this cell ends with "CUDA out of memory"
# on a 10.73 GiB GPU right after training starts. The log also loads
# datasets/coco/... rather than the paths registered for 'self_train', so the
# saved output appears to come from a different configuration; re-run from a
# fresh kernel before trusting it.
trainer.train()

[32m[04/16 13:47:29 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

[32m[04/16 13:47:53 d2.data.datasets.coco]: [0mLoading datasets/coco/annotations/instances_train2017.json takes 23.98 seconds.
[32m[04/16 13:47:53 d2.data.datasets.coco]: [0mLoaded 118287 images in COCO format from datasets/coco/annotations/instances_train2017.json
[32m[04/16 13:48:00 d2.data.build]: [0mRemoved 1021 images with no usable annotations. 117266 images left.
[32m[04/16 13:48:02 d2.data.common]: [0mSerializing 117266 elements to byte tensors and concatenating them all ...
[32m[04/16 13:48:05 d2.data.common]: [0mSerialized dataset takes 457.92 MiB
[32m[04/16 13:48:05 d2.data.detection_utils]: [0mTransformGens used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice'), RandomFlip()]
[32m[04/16 13:48:05 d2.data.build]: [0mUsing training sampler TrainingSampler
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mRemapping C2 weights ......
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: 

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv2.norm.running_mean         loaded from res2_2_branch2b_bn_running_mean   of shape (64,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv2.norm.running_var          loaded from res2_2_branch2b_bn_running_var    of shape (64,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv2.norm.weight               loaded from res2_2_branch2b_bn_gamma          of shape (64,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv2.weight                    loaded from res2_2_branch2b_w                 of shape (64, 64, 3, 3)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv3.norm.bias                 loaded from res2_2_branch2c_bn_beta           of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res2.2.conv3.norm.running_me

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv1.norm.running_var          loaded from res3_2_branch2a_bn_running_var    of shape (128,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv1.norm.weight               loaded from res3_2_branch2a_bn_gamma          of shape (128,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv1.weight                    loaded from res3_2_branch2a_w                 of shape (128, 512, 1, 1)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv2.norm.bias                 loaded from res3_2_branch2b_bn_beta           of shape (128,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv2.norm.running_mean         loaded from res3_2_branch2b_bn_running_mean   of shape (128,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res3.2.conv2.norm.runni

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.0.shortcut.norm.weight            loaded from res4_0_branch1_bn_gamma           of shape (1024,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.0.shortcut.weight                 loaded from res4_0_branch1_w                  of shape (1024, 512, 1, 1)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.1.conv1.norm.bias                 loaded from res4_1_branch2a_bn_beta           of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.1.conv1.norm.running_mean         loaded from res4_1_branch2a_bn_running_mean   of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.1.conv1.norm.running_var          loaded from res4_1_branch2a_bn_running_var    of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.1.conv1.norm.wei

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.3.conv3.weight                    loaded from res4_3_branch2c_w                 of shape (1024, 256, 1, 1)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.4.conv1.norm.bias                 loaded from res4_4_branch2a_bn_beta           of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.4.conv1.norm.running_mean         loaded from res4_4_branch2a_bn_running_mean   of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.4.conv1.norm.running_var          loaded from res4_4_branch2a_bn_running_var    of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.4.conv1.norm.weight               loaded from res4_4_branch2a_bn_gamma          of shape (256,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res4.4.conv1.weight   

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.0.shortcut.norm.bias              loaded from res5_0_branch1_bn_beta            of shape (2048,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.0.shortcut.norm.running_mean      loaded from res5_0_branch1_bn_running_mean    of shape (2048,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.0.shortcut.norm.running_var       loaded from res5_0_branch1_bn_running_var     of shape (2048,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.0.shortcut.norm.weight            loaded from res5_0_branch1_bn_gamma           of shape (2048,)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.0.shortcut.weight                 loaded from res5_0_branch1_w                  of shape (2048, 1024, 1, 1)
[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mbackbone.bottom_up.res5.1.conv1.norm

[32m[04/16 13:48:08 d2.checkpoint.c2_model_loading]: [0mThe checkpoint contains parameters not used by the model:
  [35mfc1000_b[0m
  [35mfc1000_w[0m
  [35mconv1_b[0m
[32m[04/16 13:48:08 d2.engine.train_loop]: [0mStarting training from iteration 0
[4m[5m[31mERROR[0m [32m[04/16 13:48:09 d2.engine.train_loop]: [0mException during training:
Traceback (most recent call last):
  File "/home/weiweia92/detectron2_repo/detectron2/engine/train_loop.py", line 132, in train
    self.run_step()
  File "/home/weiweia92/detectron2_repo/detectron2/engine/train_loop.py", line 215, in run_step
    loss_dict = self.model(data)
  File "/home/weiweia92/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/weiweia92/detectron2_repo/detectron2/modeling/meta_arch/rcnn.py", line 121, in forward
    features = self.backbone(images.tensor)
  File "/home/weiweia92/anaconda3/lib/python3.7/site-packages/tor

RuntimeError: CUDA out of memory. Tried to allocate 250.00 MiB (GPU 0; 10.73 GiB total capacity; 8.51 GiB already allocated; 71.62 MiB free; 8.96 GiB reserved in total by PyTorch)