# Train the Light-Head Mask-RCNN on the COCO Dataset
This notebook provides a quick way to start training a Light-Head Mask-RCNN model on the COCO dataset.

In [1]:
"""
Based on the work of Waleed Abdulla (Matterport)
written by wozhouh
"""

# Standard-library and third-party packages
import os
import sys

import imgaug

# Project root, resolved relative to the notebook's working directory
ROOT_DIR = os.path.abspath("../../../")

# Expose the project root on sys.path so the local Mask RCNN package resolves
sys.path.append(ROOT_DIR)
from mrcnn import model as modellib

# Expose samples/coco so the local COCO config/dataset module resolves
sys.path.append(os.path.join(ROOT_DIR, "samples", "coco"))
import coco

# Optional: pin training to a specific GPU by uncommenting the line below
# os.environ["CUDA_VISIBLE_DEVICES"] = "4"

Using TensorFlow backend.


In [2]:
# Default settings
# expanduser("~") yields the same directory as $HOME when that variable is set,
# but still returns a string when it is not; os.getenv('HOME') would return
# None in that case and make the os.path.join calls below raise TypeError.
HOME_DIR = os.path.expanduser("~")
DEFAULT_WEIGHTS_DIR = os.path.join(ROOT_DIR, "weights")
DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")
DEFAULT_DATASET_YEAR = "2017"
DEFAULT_COCO_PATH = os.path.join(HOME_DIR, "data", "Coco")

# Weights to load
# Alternative starting point (kept for reference):
# MODEL_PATH_UNDER_HOME = os.path.join(DEFAULT_WEIGHTS_DIR, "ResNet-101", "mask_rcnn_coco.h5")
MODEL_PATH_UNDER_HOME = os.path.join(DEFAULT_LOGS_DIR, "Light-Head", "training20181211T1752", "mask_rcnn_training_0213.h5")
# Despite its name, MODEL_PATH_UNDER_HOME is rooted at ROOT_DIR, which is an
# absolute path (os.path.abspath). Joining it onto HOME_DIR therefore discarded
# HOME_DIR and returned the path unchanged, so it is assigned directly.
INIT_MODEL_PATH = MODEL_PATH_UNDER_HOME
LOG_DIR = os.path.join(DEFAULT_LOGS_DIR, "Light-Head")

In [6]:
# Training configuration
class TrainingConfig(coco.CocoConfig):
    """Overrides of coco.CocoConfig used for this Light-Head training run.

    The active LOSS_WEIGHTS set the RPN and mask loss weights to 0.0, so only
    the detection-head classification and box-regression losses are weighted.
    """
    # NOTE(review): presumably used to name the log directory and checkpoint
    # files ("training<timestamp>/mask_rcnn_training_*.h5") — confirm in modellib.
    NAME = "training"
    
    # GPU
    # Effective batch size = GPU_COUNT * IMAGES_PER_GPU (2 * 2 = 4 here).
    GPU_COUNT = 2
    IMAGES_PER_GPU = 2
    
    # data
    IMAGE_MIN_DIM = 800
    IMAGE_MAX_DIM = 1024
    RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
    
    # model
    BACKBONE = "resnet101"
    BACKBONE_STRIDES = [4, 8, 16, 32, 64]
    
    # heads
    TOP_DOWN_PYRAMID_SIZE = 256
    DETECTION_HEAD = "light-head" 
    MASK_HEAD = "original"
    FPN_CLASSIF_FC_LAYERS_SIZE = 2048
    RPN_TRAIN_ANCHORS_PER_IMAGE = 256
    LARGE_SEPARABLE_KERNEL_SIZE = 15
    LARGE_SEPARABLE_CHANNELS_MID = 256
    # NOTE(review): 490 presumably equals 10 * 7 * 7 as in the Light-Head R-CNN
    # design — confirm against the head implementation.
    LARGE_SEPARABLE_CHANNELS_OUT = 490
    
    # training
    LEARNING_RATE = 0.001
    WEIGHT_DECAY = 0.0001
    # Alternative that weights every loss term equally (kept for reference):
    # LOSS_WEIGHTS = {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
    # Active weights: RPN and mask terms are zeroed; the box loss is up-weighted to 1.5.
    LOSS_WEIGHTS = {'rpn_class_loss': 0.0, 'rpn_bbox_loss': 0.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.5, 'mrcnn_mask_loss': 0.0}
    STEPS_PER_EPOCH = 1000
    VALIDATION_STEPS = 50
    TRAIN_BN = False
    TRAIN_ROIS_PER_IMAGE = 200
    ROI_POSITIVE_RATIO = 0.33
    
# Instantiate the configuration and print every setting for the record
config = TrainingConfig()
config.display()
# Build the model in training mode; LOG_DIR is passed as the model directory
model = modellib.MaskRCNN(mode="training", model_dir=LOG_DIR, config=config)


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     4
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_HEAD                 light-head
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     2048
GPU_COUNT                      2
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 2
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  1024
IMAGE_META_SIZE                93
IMAGE_MIN_DIM                  800
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [1024 1024    3]
LARGE_SEPARABLE_CHANNELS_MID   256
LARGE_SEPARABLE_CHANNELS_OUT   490
LARGE_SEPARABLE_KERNEL_SIZE    15
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 0.0, 'r

In [4]:
# Print the layer-by-layer summary (type, output shape, parameter count,
# connections) of the underlying Keras model
model.keras_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_image (InputLayer)        (None, None, None, 3 0                                            
__________________________________________________________________________________________________
input_image_meta (InputLayer)   (None, 93)           0                                            
__________________________________________________________________________________________________
input_rpn_match (InputLayer)    (None, None, 1)      0                                            
__________________________________________________________________________________________________
input_rpn_bbox (InputLayer)     (None, None, 4)      0                                            
__________________________________________________________________________________________________
input_gt_c

In [7]:
# Load the initial weights from the checkpoint at INIT_MODEL_PATH.
# NOTE(review): by_name=True presumably matches weights to layers by layer
# name, as in Keras — confirm against modellib.MaskRCNN.load_weights.
model.load_weights(INIT_MODEL_PATH, by_name=True)

Re-starting from epoch 213


In [8]:
def _load_coco_subset(subset):
    """Create, load and prepare a CocoDataset for the given subset name."""
    dataset = coco.CocoDataset()
    dataset.load_coco(DEFAULT_COCO_PATH, subset, year=DEFAULT_DATASET_YEAR)
    dataset.prepare()
    return dataset


# Training and validation subsets of the configured COCO release
train_dataset = _load_coco_subset("train")
val_dataset = _load_coco_subset("val")

# Image augmentation: left/right flip applied to 50% of the images
augmentation = imgaug.augmenters.Fliplr(0.5)

loading annotations into memory...
Done (t=13.24s)
creating index...
index created!
loading annotations into memory...
Done (t=4.50s)
creating index...
index created!


In [None]:
# Layer-selection keywords listed by the original author:
# "heads", "5+", "4+", "3+", "all", "light-head".
# NOTE(review): the value used below, 'light-head-detection', is not in that
# list — confirm it against the layer filters defined in modellib.

# Fine-tuning rate: the configured LEARNING_RATE scaled down by a factor of 50
fine_tune_lr = config.LEARNING_RATE / 50.0
# Epoch number at which training stops (a final epoch index, not a count of
# additional epochs: the log resumes at "Epoch 214/302")
final_epoch = 302

model.train(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    learning_rate=fine_tune_lr,
    epochs=final_epoch,
    layers='light-head-detection',
    augmentation=augmentation
)


Starting at epoch 213. LR=2e-05

Checkpoint Path: /home/processyuan/code/HumanMask/my-Mobile-Mask-RCNN/logs/Light-Head/training20181211T1752/mask_rcnn_training_{epoch:04d}.h5
Selecting layers to train
In model:  rpn_model
In model:  large_separable_conv
    light_head_large_separable_conv_0a   (Conv2D)
    light_head_large_separable_conv_1a   (Conv2D)
    light_head_large_separable_conv_0b   (Conv2D)
    light_head_large_separable_conv_1b   (Conv2D)
    light_head_large_separable_bn   (BatchNorm)
light_head_class_conv   (TimeDistributed)
light_head_class_bn    (TimeDistributed)
light_head_bbox_fc     (TimeDistributed)
light_head_class_logits   (TimeDistributed)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 214/302
  72/1000 [=>............................] - ETA: 31:09 - loss: 0.5335 - rpn_class_loss: 0.0000e+00 - rpn_bbox_loss: 0.0000e+00 - mrcnn_class_loss: 0.1981 - mrcnn_bbox_loss: 0.3354 - mrcnn_mask_loss: 0.0000e+00

In [None]:
# For reference, with the trained ResNet-101-based Mask-RCNN the losses should be approximately:
# Epoch 1/30
#  28/1000 [..............................] - ETA: 1:37:39 
# - loss: 0.6337 
# - rpn_class_loss: 0.0112 
# - rpn_bbox_loss: 0.1044 
# - mrcnn_class_loss: 0.2009 
# - mrcnn_bbox_loss: 0.0904 
# - mrcnn_mask_loss: 0.2268