In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import torch
import torch.utils.data
from PIL import Image

In [3]:
# Limit which GPUs CUDA exposes to this process (device indices 0 and 1).
# NOTE(review): the value contains a space after the comma; the usual form is
# a plain comma-separated list ('0,1') — confirm both GPUs are actually visible.
# NOTE(review): this runs after `import torch`; presumably it still takes
# effect because CUDA has not been initialised yet — verify.
os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'

In [4]:
import torch.nn as nn

In [5]:
import torchvision
from torchvision.models.detection import FasterRCNN, MaskRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
# import torchvision.datasets as dset

sys.path.append('../maskrcnntools')
import transforms as T
from coco import CocoDetection
from engine import train_one_epoch, evaluate
import utils
from logger import Logger

In [6]:
sys.path.append('../lib/coco2voc')
from coco2voc_aux import annsToSeg

In [15]:
# Root of the COCO 2017 dataset (relative path; resolved against the
# notebook's working directory).
COCO_DATASET = '../../../datasets/COCO/'
# Image folders for the train and validation splits.
TRAIN_DIR = os.path.join(COCO_DATASET, 'train2017')
VAL_DIR = os.path.join(COCO_DATASET, 'val2017')
# Instance annotation JSON files for each split.
ANNOTATIONS_DIR = os.path.join(COCO_DATASET, 'annotations')
TRAIN_ANNOT = os.path.join(ANNOTATIONS_DIR, 'instances_train2017.json')
VAL_ANNOT = os.path.join(ANNOTATIONS_DIR, 'instances_val2017.json')

# Directory handed to the Logger and to the TensorBoard command line.
TB_LOGS_DIR = '../logs'

In [8]:
# Build the raw COCO datasets for both splits from their image folder and
# annotation file. Loading the annotation JSON is the slow part (see the
# timings printed below).
coco_train = CocoDetection(root=TRAIN_DIR,
                           annFile=TRAIN_ANNOT)
coco_val = CocoDetection(root=VAL_DIR,
                         annFile=VAL_ANNOT)

loading annotations into memory...
Done (t=13.99s)
creating index...
index created!
loading annotations into memory...
Done (t=0.42s)
creating index...
index created!


In [9]:
def get_transform(train):
    """Build the image/target transform pipeline.

    Args:
        train: when truthy, a random horizontal flip (probability 0.5) is
            appended for data augmentation.

    Returns:
        A ``T.Compose`` wrapping the selected transforms.
    """
    # Always start by turning the PIL image into a tensor.
    pipeline = [T.ToTensor()]
    if train:
        # Augmentation is only wanted while training.
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

In [10]:
class CocoDataset(object):
    """Adapt a COCO detection dataset to the dict-style targets used for training.

    Each item is an ``(image, target)`` pair where ``target`` is a dict with
    the keys ``boxes``, ``labels``, ``masks``, ``image_id``, ``area`` and
    ``iscrowd``.

    Args:
        torch_dataset: indexable dataset yielding ``(image, annotations)``
            pairs, where ``annotations`` is a list of dicts carrying at least
            ``category_id`` and ``iscrowd``. It must expose its COCO index as
            the ``.coco`` attribute.
        transforms: optional callable invoked as ``transforms(image, target)``
            that returns the transformed ``(image, target)`` pair.
    """

    def __init__(self, torch_dataset, transforms=None):
        self.torch_dataset = torch_dataset
        self.transforms = transforms

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair stored at position ``idx``."""
        img, annotations = self.torch_dataset[idx]

        # Resolve masks against the COCO index of the dataset being wrapped.
        # Using a fixed global index here would pair validation annotations
        # with the training index.
        # NOTE(review): element [3] of annsToSeg's result is assumed to be one
        # mask per annotation, in annotation order — confirm in coco2voc_aux.
        masks = annsToSeg(annotations, self.torch_dataset.coco)[3]
        masks = masks.astype(bool)

        category_ids = [ann['category_id'] for ann in annotations]

        # Tight bounding box around the foreground pixels of every mask.
        boxes = []
        for i in range(len(category_ids)):
            rows, cols = np.where(masks[i])
            boxes.append([np.min(cols), np.min(rows), np.max(cols), np.max(rows)])

        # reshape keeps the (N, 4) layout even when there are no objects, so
        # the column indexing used for the area below stays valid.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(category_ids, dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        # The dataset index doubles as the image identifier.
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        iscrowd = torch.as_tensor([ann['iscrowd'] for ann in annotations],
                                  dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.torch_dataset)

In [11]:
# Smoke test: wrap the training split and display its first (image, target)
# sample to check that the conversion above works end to end.
train_dataset = CocoDataset(coco_train, get_transform(train=True))
train_dataset[0]

(tensor([[[0.5137, 0.5255, 0.5255,  ..., 0.0000, 0.0039, 0.0078],
          [0.5137, 0.5294, 0.5373,  ..., 0.0039, 0.0118, 0.0196],
          [0.5176, 0.5333, 0.5412,  ..., 0.0000, 0.0000, 0.0039],
          ...,
          [0.0392, 0.0157, 0.0000,  ..., 0.0275, 0.0235, 0.0118],
          [0.0118, 0.0235, 0.0510,  ..., 0.0275, 0.0196, 0.0196],
          [0.0078, 0.0078, 0.0039,  ..., 0.0275, 0.0196, 0.0157]],
 
         [[0.6588, 0.6706, 0.6706,  ..., 0.0863, 0.0902, 0.0941],
          [0.6588, 0.6745, 0.6824,  ..., 0.0863, 0.0902, 0.0980],
          [0.6627, 0.6784, 0.6863,  ..., 0.0745, 0.0784, 0.0824],
          ...,
          [0.0157, 0.0392, 0.0431,  ..., 0.0039, 0.0039, 0.0039],
          [0.0078, 0.0078, 0.0314,  ..., 0.0118, 0.0000, 0.0000],
          [0.0196, 0.0118, 0.0118,  ..., 0.0118, 0.0000, 0.0000]],
 
         [[0.7647, 0.7725, 0.7725,  ..., 0.4235, 0.4275, 0.4275],
          [0.7608, 0.7765, 0.7843,  ..., 0.4353, 0.4392, 0.4392],
          [0.7647, 0.7804, 0.7882,  ...,

In [12]:
# load a pre-trained model for classification and return
# only the features
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# MaskRCNN needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# OrderedDict[Tensor], and in featmap_names you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                                output_size=7,
                                                sampling_ratio=2)

# same idea for the mask head, but with a larger 14x14 crop
mask_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                                     output_size=14,
                                                     sampling_ratio=2)
# put the pieces together inside a MaskRCNN model
# NOTE(review): num_classes is hard-coded to 172 here, while a `num_classes`
# variable with the same value is only defined in a later cell — keep the two
# in sync, and confirm 172 is the intended class count for these annotations.
model = MaskRCNN(backbone,
                 num_classes=172,
                 rpn_anchor_generator=anchor_generator,
                 box_roi_pool=roi_pooler,
                 mask_roi_pool=mask_roi_pooler)

In [13]:
def get_instance_segmentation_model(num_classes):
    """Return a pre-trained Mask R-CNN (ResNet-50 FPN) with fresh prediction heads.

    The box predictor and the mask predictor of the pre-trained model are
    both replaced by new ones sized for ``num_classes`` outputs.

    Args:
        num_classes: number of output classes for the new heads.

    Returns:
        The model with both heads swapped.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = model.roi_heads

    # Box head: keep the input width of the existing classifier layer.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: keep the input channels of the existing mask conv, with a
    # 256-channel hidden layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    mask_hidden = 256
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels,
                                             mask_hidden,
                                             num_classes)

    return model

In [14]:
# Wrap both splits; only the training split gets the random-flip augmentation.
dataset_train = CocoDataset(coco_train, get_transform(train=True))
dataset_val = CocoDataset(coco_val, get_transform(train=False))

# Training loader: shuffled mini-batches of 2 samples, batched by
# utils.collate_fn.
data_loader_train = torch.utils.data.DataLoader(
    dataset_train, batch_size=2, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

# Validation loader: one sample at a time, in dataset order.
data_loader_val = torch.utils.data.DataLoader(
    dataset_val, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

In [16]:
# NOTE(review): num_classes is only referenced by the commented-out line
# below; the active `model` was built in an earlier cell with a literal 172.
num_classes = 172

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Alternative (disabled): COCO-pretrained ResNet-50 FPN model, optionally
# wrapped in DataParallel. With these lines commented out, `model` is the
# MobileNetV2-backed MaskRCNN created earlier in the notebook.
# model = get_instance_segmentation_model(num_classes)
# model = nn.DataParallel(model)
model.to(device)

# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.00001,
                            momentum=0.9, weight_decay=0.0005)

# Logger writing into TB_LOGS_DIR (the directory TensorBoard is pointed at).
logger = Logger(TB_LOGS_DIR)

In [17]:
from multiprocessing import Process
import time

def start_tensorboard(command, delay=10):
    """Wait ``delay`` seconds, then run ``command`` through the system shell.

    Intended to be used as the target of a background process.

    Args:
        command: shell command line to execute; passed unchanged to
            ``os.system``. It must come from trusted input.
        delay: seconds to sleep before launching the command. Defaults to 10;
            zero or a negative value skips the wait.

    Returns:
        Whatever ``os.system`` returns for the command.
    """
    if delay > 0:
        time.sleep(delay)
    return os.system(command)

In [19]:
# Show the command so TensorBoard can also be started by hand.
print('Monitor here:')
print('\ttensorboard --logdir="{}"'.format(TB_LOGS_DIR))

# Launch the same command in a separate process so the notebook kernel is not
# blocked; start_tensorboard sleeps before running it.
command = 'tensorboard --logdir="{}"'.format(TB_LOGS_DIR)
p = Process(target=start_tensorboard, args=[command,])
p.start()

Monitor here:
	tensorboard --logdir="../logs"


In [22]:
# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations; the learning-rate
    # schedule and the logger are passed through to train_one_epoch
    train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=10, lr_schedule='cyclic', logger=logger)
    # evaluate on the validation split (data_loader_val)
    evaluate(model, data_loader_val, device=device)

Epoch: [0]  [    0/58633]  eta: 2 days, 14:43:06  lr: 0.000001  loss: 6.7490 (6.7490)  loss_classifier: 5.0637 (5.0637)  loss_box_reg: 0.0628 (0.0628)  loss_mask: 0.8354 (0.8354)  loss_objectness: 0.6870 (0.6870)  loss_rpn_box_reg: 0.1002 (0.1002)  time: 3.8508  data: 0.3461  max mem: 4234
Epoch: [0]  [   10/58633]  eta: 10:41:27  lr: 0.000004  loss: 6.9725 (7.0580)  loss_classifier: 5.0576 (5.0527)  loss_box_reg: 0.0628 (0.0820)  loss_mask: 0.9359 (0.9592)  loss_objectness: 0.7010 (0.7032)  loss_rpn_box_reg: 0.1602 (0.2608)  time: 0.6565  data: 0.0390  max mem: 5239
Epoch: [0]  [   20/58633]  eta: 8:07:10  lr: 0.000006  loss: 6.9725 (6.9916)  loss_classifier: 5.0272 (5.0252)  loss_box_reg: 0.0617 (0.0866)  loss_mask: 0.9553 (0.9826)  loss_objectness: 0.6956 (0.6985)  loss_rpn_box_reg: 0.1325 (0.1988)  time: 0.3311  data: 0.0072  max mem: 5239
Epoch: [0]  [   30/58633]  eta: 7:08:59  lr: 0.000009  loss: 6.8829 (6.9374)  loss_classifier: 4.9336 (4.9750)  loss_box_reg: 0.0626 (0.0809)  l

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
