
# Assignment 3

This is a template notebook for Assignment 3.


## Install dependencies and initialization

In [1]:
# import some common libraries
# from google.colab.patches import cv2_imshow
from sklearn.metrics import jaccard_score
from PIL import Image, ImageDraw
from tqdm import tqdm
import pandas as pd
import numpy as np
import datetime
import random
import json
import cv2
import csv
import os

# import some common pytorch utilities
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch

# import some common detectron2 utilities
import detectron2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.visualizer import Visualizer
from detectron2.data import build_detection_test_loader
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
setup_logger()

<Logger detectron2 (DEBUG)>

In [2]:
# Make sure that GPU is available for your notebook.
# Otherwise, you need to update the settungs in Runtime -> Change runtime type -> Hardware accelerator
torch.cuda.is_available()

True

In [3]:
# Define the location of current directory, which should contain data/train, data/test, and data/train.json.
# TODO: approx 1 line
BASE_DIR = '.'
OUTPUT_DIR = '{}/output'.format(BASE_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Part 1: Object Detection

### Data Loader

In [4]:
'''
# This function should return a list of data samples in which each sample is a dictionary.
# Make sure to select the correct bbox_mode for the data
# For the test data, you only have access to the images, therefore, the annotations should be empty.
# Other values could be obtained from the image files.
# TODO: approx 35 lines
'''
VAL_RATE = 0.2 # Precentage of the validate size
def get_detection_data(set_name, datapath="data/train.json"):
    data_dirs = '{}/data'.format(BASE_DIR)
    # return test_set, no annotations
    if set_name == "test":
        test_set = []
        for fname in os.listdir(os.path.join(data_dirs, "test")):
            if os.path.splitext(fname)[1] == ".png":
                path = os.path.join(data_dirs, "test", fname)
                width, height = Image.open(path).size
                test_set.append({
                    "file_name": path,
                    "image_id": os.path.splitext(fname)[0],
                    "height": height,
                    "width": width,
                    "annotations": []
                })
        return test_set
    # return validate_set or train_set, with annotations
    with open(datapath) as f:
        data = json.load(f)
    validate_size = int(len(data)*VAL_RATE)
    train_annotations, validate_annotations = data[0:len(data)-validate_size], data[len(data)-validate_size:]
    annotations = validate_annotations if set_name == "val" else (data if set_name == "all" else train_annotations)
    # return validate_set or train_set, with annotations
    datadict = {}
    for annotation in annotations:
        path = os.path.join(data_dirs, "train", annotation["file_name"])
        anno = {
            "bbox": annotation["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "segmentation": annotation["segmentation"],
            "category_id": annotation["category_id"],
            "iscrowd": annotation["iscrowd"],
            "area": annotation["area"]
        }
        if path in datadict:
            datadict[path]["annotations"].append(anno)
            continue
        width, height = Image.open(path).size
        datadict[path] = {
            "image_id": annotation["image_id"],
            "height": height,
            "width": width,
            "annotations": [{
                "bbox": annotation["bbox"],
                "bbox_mode": BoxMode.XYWH_ABS,
                "segmentation": annotation["segmentation"],
                "category_id": annotation["category_id"],
                "iscrowd": annotation["iscrowd"],
                "area": annotation["area"]
            }]
        }
    return [{"file_name": path, **data} for path, data in datadict.items()]

In [5]:
'''
# Remember to add your dataset to DatasetCatalog and MetadataCatalog
# Consdier "data_detection_train" and "data_detection_test" for registration
# You can also add an optional "data_detection_val" for your validation by spliting the training data
# TODO: approx 5 lines
'''
for i in ["train", "val", "all", "test"]:
    DatasetCatalog.register("data_detection_{}".format(i), lambda i=i: get_detection_data(i))
    MetadataCatalog.get("data_detection_{}".format(i)).set(thing_classes=["not plane 1", "not plane 2", "not plane 3", "not plane 4", "plane"])

DatasetCatalog.register("data_detection_all_ori", lambda i=i: get_detection_data("all", datapath="data/train.json"))
MetadataCatalog.get("data_detection_all_ori").set(thing_classes=["not plane 1", "not plane 2", "not plane 3", "not plane 4", "plane"])

namespace(name='data_detection_all_ori',
          thing_classes=['not plane 1',
                         'not plane 2',
                         'not plane 3',
                         'not plane 4',
                         'plane'])

In [6]:
'''
# Visualize some samples using Visualizer to make sure that the function works correctly
# TODO: approx 5 lines
'''
train_set = get_detection_data("train")
data = train_set[random.randrange(0, len(train_set))]
img = cv2.imread(data["file_name"])
visualizer = Visualizer(img[:, :, ::-1], metadata=MetadataCatalog.get("data_detection_train"), scale=0.5)
out = visualizer.draw_dataset_dict(data)
save_path = os.path.join(BASE_DIR, "output", "train_set.jpg")
cv2.imwrite(save_path, out.get_image()[:, :, ::-1])

True

### Set Configs

In [7]:
'''
# Set the configs for the detection part in here.
# TODO: approx 15 lines
'''
cfg = get_cfg()
# model settings
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"))
cfg.OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
# training settings
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ()
cfg.SOLVER.MAX_ITER = 1200
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml") # pretrain model

### Training

In [8]:
'''
# Create a DefaultTrainer using the above config and train the model
# TODO: approx 5 lines
'''
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

[11/11 05:23:16 d2.engine.defaults]: Model:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
      (res

[11/11 05:23:17 d2.data.build]: Removed 0 images with no usable annotations. 159 images left.
[11/11 05:23:17 d2.data.build]: Distribution of instances among all 5 categories:
|  category   | #instances   |  category   | #instances   |  category   | #instances   |
|:-----------:|:-------------|:-----------:|:-------------|:-----------:|:-------------|
| not plane 1 | 0            | not plane 2 | 0            | not plane 3 | 0            |
| not plane 4 | 0            |    plane    | 6384         |             |              |
|    total    | 6384         |             |              |             |              |
[11/11 05:23:17 d2.data.dataset_mapper]: [DatasetMapper] Augmentations used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice'), RandomFlip()]
[11/11 05:23:17 d2.data.build]: Using training sampler TrainingSampler
[11/11 05:23:17 d2.data.common]: Serializing 159 elements to byte tensors and concatenating them

  max_size = (max_size + (stride - 1)) // stride * stride
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[11/11 05:23:38 d2.utils.events]:  eta: 0:05:26  iter: 19  total_loss: 3.688  loss_cls: 1.051  loss_box_reg: 0.2366  loss_rpn_cls: 1.712  loss_rpn_loc: 0.5286  time: 0.9299  data_time: 0.7127  lr: 4.9953e-06  max_mem: 3743M
[11/11 05:24:07 d2.utils.events]:  eta: 0:08:17  iter: 39  total_loss: 2.047  loss_cls: 0.5626  loss_box_reg: 0.1987  loss_rpn_cls: 0.6582  loss_rpn_loc: 0.3284  time: 1.1856  data_time: 1.0675  lr: 9.9902e-06  max_mem: 4730M
[11/11 05:24:32 d2.utils.events]:  eta: 0:05:49  iter: 59  total_loss: 1.413  loss_cls: 0.5444  loss_box_reg: 0.2317  loss_rpn_cls: 0.1745  loss_rpn_loc: 0.2207  time: 1.2062  data_time: 0.9572  lr: 1.4985e-05  max_mem: 4730M
[11/11 05:24:53 d2.utils.events]:  eta: 0:06:01  iter: 79  total_loss: 1.709  loss_cls: 0.4936  loss_box_reg: 0.3135  loss_rpn_cls: 0.2666  loss_rpn_loc: 0.3798  time: 1.1735  data_time: 0.7909  lr: 1.998e-05  max_mem: 4730M
[11/11 05:25:11 d2.utils.events]:  eta: 0:05:37  iter: 99  total_loss: 1.223  loss_cls: 0.2307  los

[11/11 05:35:59 d2.utils.events]:  eta: 0:02:10  iter: 759  total_loss: 0.7079  loss_cls: 0.1513  loss_box_reg: 0.253  loss_rpn_cls: 0.09299  loss_rpn_loc: 0.1706  time: 0.9979  data_time: 0.7713  lr: 0.00018981  max_mem: 6923M
[11/11 05:36:15 d2.utils.events]:  eta: 0:02:03  iter: 779  total_loss: 0.6337  loss_cls: 0.1219  loss_box_reg: 0.2003  loss_rpn_cls: 0.09463  loss_rpn_loc: 0.1905  time: 0.9923  data_time: 0.4930  lr: 0.00019481  max_mem: 6923M
[11/11 05:36:36 d2.utils.events]:  eta: 0:01:58  iter: 799  total_loss: 0.7787  loss_cls: 0.172  loss_box_reg: 0.3014  loss_rpn_cls: 0.1188  loss_rpn_loc: 0.217  time: 0.9942  data_time: 0.7170  lr: 0.0001998  max_mem: 6923M
[11/11 05:36:51 d2.utils.events]:  eta: 0:01:52  iter: 819  total_loss: 0.676  loss_cls: 0.119  loss_box_reg: 0.2112  loss_rpn_cls: 0.07871  loss_rpn_loc: 0.1703  time: 0.9885  data_time: 0.5109  lr: 0.0002048  max_mem: 6923M
[11/11 05:37:08 d2.utils.events]:  eta: 0:01:46  iter: 839  total_loss: 0.6147  loss_cls: 0.

### Evaluation and Visualization

In [9]:
'''
# After training the model, you need to update cfg.MODEL.WEIGHTS
# Define a DefaultPredictor
'''
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
# cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml") # pretrain model
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
predictor = DefaultPredictor(cfg)

In [10]:
'''
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
'''
test_set = get_detection_data("test")
for idx in [18, 16, 37]:
    data = test_set[idx]
    img = cv2.imread(data["file_name"])
    result = predictor(img)
    visualizer = Visualizer(img[:, :, ::-1], metadata=MetadataCatalog.get("data_detection_train"), scale=0.5, instance_mode=ColorMode.IMAGE_BW)
    result = visualizer.draw_instance_predictions(result["instances"].to("cpu"))
    img = result.get_image()[:, :, ::-1]
    cv2.imwrite(os.path.join(cfg.OUTPUT_DIR, f"test_set_{idx}.jpg"), img)

In [11]:
'''
# Use COCOEvaluator and build_detection_train_loader
# You can save the output predictions using inference_on_dataset
# TODO: approx 5 lines
'''
evaluator = COCOEvaluator("data_detection_val", output_dir=cfg.OUTPUT_DIR)
test_loader = build_detection_test_loader(cfg, "data_detection_val")
print(inference_on_dataset(predictor.model, test_loader, evaluator))

[11/11 05:45:57 d2.evaluation.coco_evaluation]: Trying to convert 'data_detection_val' to COCO format ...
[11/11 05:45:59 d2.data.build]: Distribution of instances among all 5 categories:
|  category   | #instances   |  category   | #instances   |  category   | #instances   |
|:-----------:|:-------------|:-----------:|:-------------|:-----------:|:-------------|
| not plane 1 | 0            | not plane 2 | 0            | not plane 3 | 0            |
| not plane 4 | 0            |    plane    | 1596         |             |              |
|    total    | 1596         |             |              |             |              |
[11/11 05:45:59 d2.data.dataset_mapper]: [DatasetMapper] Augmentations used in inference: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[11/11 05:45:59 d2.data.common]: Serializing 40 elements to byte tensors and concatenating them all ...
[11/11 05:45:59 d2.data.common]: Serialized dataset takes 2.21 MiB
[11/11 05:45:59 d

### Improvements

For this part, you can bring any improvement which you have by adding new input parameters to the previous functions or defining new functions and variables.

In [12]:
'''
# Bring any changes and updates regarding the improvement in here
'''
print("Modification done above")

Modification done above


## Part 2: Semantic Segmentation

### Data Loader

In [13]:
'''
# Write a function that returns the cropped image and corresponding mask regarding the target bounding box
# idx is the index of the target bbox in the data
# high-resolution image could be passed or could be load from data['file_name']
# You can use the mask attribute of detectron2.utils.visualizer.GenericMask
#     to convert the segmentation annotations to binary masks
# TODO: approx 10 lines
'''
from detectron2.utils.visualizer import GenericMask
cache_dir = os.path.join(BASE_DIR, "data", "cache-new")
os.makedirs(cache_dir, exist_ok=True)
big_cache = {}
queue = []
def get_instance_sample(data, idx, img=None):
    height, width = data['height'], data['width']
    bbox = data['annotations'][idx]['bbox']
    x1, y1 = int(bbox[0]), int(bbox[1])
    x2, y2 = x1 + int(bbox[2]), y1 + int(bbox[3])
    cache_path = os.path.join(cache_dir, os.path.basename(data['file_name']) + f"-{idx}.png")
    if not os.path.isfile(cache_path):
        if data['file_name'] not in big_cache:
            if len(big_cache) >= 2:
                big_cache[queue.pop()] = None
            big_cache[data['file_name']] = cv2.imread(data['file_name'])
            queue.append(data['file_name'])
        cv2.imwrite(cache_path, big_cache[data['file_name']][y1:y2,x1:x2,:])
    obj_img = cv2.imread(cache_path)
    obj_mask = np.zeros((int(bbox[3]), int(bbox[2])))
    if len(data['annotations'][idx]['segmentation']) > 0 and sum(len(a) for a in data['annotations'][idx]['segmentation']) > 0:
        obj_mask = GenericMask(data['annotations'][idx]['segmentation'], height, width).mask[y1:y2,x1:x2]
    obj_img = cv2.resize(obj_img, (128, 128))
    obj_mask = cv2.resize(obj_mask, (128, 128))
    return obj_img, obj_mask

In [14]:
'''
# We have provided a template data loader for your segmentation training
# You need to complete the __getitem__() function before running the code
# You may also need to add data augmentation or normalization in here
'''

class PlaneDataset(Dataset):
    def __init__(self, set_name, data_list, flip=False):
        self.transforms = transforms.Compose([
            transforms.ToTensor(), # Converting the image to tensor and change the image format (Channels-Last => Channels-First)
            transforms.Normalize((123.675, 116.28, 103.53), (58.395, 57.12, 57.375)),
        ])
        self.transforms_flip = transforms.Compose([])
        if flip:
            self.transforms_flip = transforms.Compose([
                transforms.RandomHorizontalFlip(0.5),
                transforms.RandomVerticalFlip(0.5),
            ])
        self.set_name = set_name
        self.data = data_list
        self.instance_map = []
        for i, d in enumerate(self.data):
            for j in range(len(d['annotations'])):
                self.instance_map.append([i,j])

    '''
    # you can change the value of length to a small number like 10 for debugging of your training procedure and overfeating
    # make sure to use the correct length for the final training
    '''
    def __len__(self):
        return len(self.instance_map)

    def numpy_to_tensor(self, img, mask):
        if self.transforms is not None:
            img = self.transforms(img)
        mask = torch.tensor(mask, dtype=torch.float)
        both_images = torch.cat((img, mask.unsqueeze(0)), 0)
        both_images = self.transforms_flip(both_images)
        img, mask = both_images[0:3], both_images[3]
        return img, mask

    '''
    # Complete this part by using get_instance_sample function
    # make sure to resize the img and mask to a fixed size (for example 128*128)
    # you can use "interpolate" function of pytorch or "numpy.resize"
    # TODO: 5 lines
    '''
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        idx = self.instance_map[idx]
        data = self.data[idx[0]]

        img, mask = get_instance_sample(data, idx[1])
        img, mask = self.numpy_to_tensor(img, mask)

        return img, mask

def get_plane_dataset(set_name='train', batch_size=2, flip=False, shuffle=False):
    my_data_list = DatasetCatalog.get("data_detection_{}".format(set_name))
    dataset = PlaneDataset(set_name, my_data_list, flip=flip)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=8, pin_memory=True, shuffle=shuffle)
    return loader, dataset

### Network

In [15]:
'''
# convolution module as a template layer consists of conv2d layer, batch normalization, and relu activation
'''
from farseg import farseg_resnet50 as MyModel
# Code is too long to put here!

### Training

In [19]:
'''
# The following is a basic training procedure to train the network
# You need to update the code to get the best performance
# TODO: approx ? lines
'''

# Set the hyperparameters
num_epochs = 1000
batch_size = 32
momentum = 0.9
learning_rate = 0.1
weight_decay = 0.0001

model = MyModel() # initialize the model
# model.load_state_dict(torch.load('{}/output/99_segmentation_model_train_by_split.pth'.format(BASE_DIR)))
model = model.cuda() # move the model to GPU
loader, _ = get_plane_dataset('train', batch_size, flip=True, shuffle=True) # initialize data_loader
crit = nn.BCELoss() # Define the loss function
optim = torch.optim.SGD(model.parameters(), momentum=momentum, lr=learning_rate, weight_decay=weight_decay) # Initialize the optimizer as SGD
from torch.optim.lr_scheduler import MultiStepLR
# scheduler = CosineAnnealingLR(optim, T_max=num_epochs*len(loader), eta_min=1e-6) #learning rate decay
max_iters = num_epochs*len(loader)
scheduler = MultiStepLR(optim, milestones=[int(max_iters*0.3),int(max_iters*0.6),int(max_iters*0.8)], gamma=0.2) #learning rate decay
# start the training procedure
for epoch in range(num_epochs):
  total_loss = 0
  for (img, mask) in tqdm(loader):
    img = img.to(device=torch.device('cuda'))
    mask = mask.to(device=torch.device('cuda'))
    pred = model(img)
    loss = crit(pred, mask)
    optim.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=35, norm_type=2)
    optim.step()
    total_loss += loss.cpu().data
    scheduler.step()
  print("Epoch: {}, Loss: {}".format(epoch, total_loss/len(loader)))
  with open("lab3-part2-train.log", "a", encoding="utf8") as f:
    f.write("Epoch: {}, Loss: {}\n".format(epoch, total_loss/len(loader)))
  if epoch % 9 == 0:
    torch.save(model.state_dict(), '{}/output/{}_segmentation_model.pth'.format(BASE_DIR, epoch))
  print("Next learning rate:", scheduler.get_last_lr())

'''
# Saving the final model
'''
torch.save(model.state_dict(), '{}/output/final_segmentation_model.pth'.format(BASE_DIR))

  0%|          | 0/1596 [00:00<?, ?it/s]



RuntimeError: Given groups=1, weight of size [4, 3, 3, 3], expected input[4, 128, 128, 3] to have 3 channels, but got 128 channels instead

### Evaluation and Visualization

In [None]:
'''
# Before starting the evaluation, you need to set the model mode to eval
# You may load the trained model again, in case if you want to continue your code later
# TODO: approx 15 lines
'''
batch_size = 32
model = MyModel().cuda()
model.load_state_dict(torch.load('{}/output/999_segmentation_model_resnet_train_by_split.pth'.format(BASE_DIR)))
model = model.eval() # chaning the model to evaluation mode will fix the bachnorm layers
loader, dataset = get_plane_dataset('val', batch_size, flip=False)

total_iou = 0
images = 0
SMOOTH = 1e-6
for (img, mask) in tqdm(loader):
  with torch.no_grad():
    img = img.cuda()
    mask = mask.cuda()
    pred = model(img)
    pred = pred > 0
    mask = mask > 0.5
    
    intersection = (pred & mask).float().sum((1, 2))  # Will be zero if Truth=0 or Prediction=0
    union = (pred | mask).float().sum((1, 2))         # Will be zzero if both are 0
    
    iou = (intersection + SMOOTH) / (union + SMOOTH)  # We smooth our devision to avoid 0/0
    
    # thresholded = torch.clamp(20 * (iou - 0.5), 0, 10).ceil() / 10  # This is equal to comparing with thresolds
    thresholded = iou
    total_iou += sum(thresholded.tolist())
    images += len(thresholded.tolist())

    '''
    ## Complete the code by obtaining the IoU for each img and print the final Mean IoU
    '''


print("\n #images: {}, Mean IoU: {}".format(images, total_iou/images))


In [None]:
'''
# Visualize 3 sample outputs
# TODO: approx 5 lines
'''
import matplotlib.pyplot as plt
for (img, mask) in loader:
    break
with torch.no_grad():
  img = img.cuda()
  mask = mask.cuda()
  pred = model(img)
  pred_mask = torch.zeros_like(pred, device=pred.device)
  pred_mask[pred > 0.5] = 255
  pred *= 255
  mask *= 255
  mask = mask.cpu().numpy()
  pred = pred.cpu().numpy()
  pred_mask = pred_mask.cpu().numpy()
  img = img.cpu().numpy()
  for k in range(batch_size):
    p, pm, m, i = pred[k, ...], pred_mask[k, ...], mask[k, ...], img[k, ...]
    fig = plt.figure(figsize=(16, 4))
    ax = fig.subplots(nrows=1, ncols=4)
    ax[0].imshow(p, cmap='gray')
    ax[1].imshow(pm, cmap='gray')
    ax[2].imshow(m, cmap='gray')
    std = [58.395, 57.12, 57.375]
    mean = [123.675, 116.28, 103.53]
    for d in range(3):
      i[d, ...] = i[d, ...] * std[d] + mean[d]
    i = torch.tensor(i).permute((1,2,0)).numpy()
    ax[3].imshow(i)
    fig.savefig(os.path.join(OUTPUT_DIR, f"val_set_{k+1}.png"))
    plt.close(fig=fig)

## Part 3: Instance Segmentation

In this part, you need to obtain the instance segmentation results for the test data by using the trained segmentation model in the previous part and the detection model in Part 1.

### Get Prediction

In [None]:
'''
# Define a new function to obtain the prediction mask by passing a sample data
# For this part, you need to use all the previous parts (predictor, get_instance_sample, data preprocessings, etc)
# It is better to keep everything (as well as the output of this funcion) on gpu as tensors to speed up the operations.
# pred_mask is the instance segmentation result and should have different values for different planes.
# TODO: approx 35 lines
'''

def get_prediction_mask(data):

  return img, gt_mask, pred_mask # gt_mask could be all zero when the ground truth is not given.


### Visualization and Submission

In [None]:
'''
# Visualise the output prediction as well as the GT Mask and Input image for a sample input
# TODO: approx 10 lines
'''



In [None]:
'''
# ref: https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
# https://www.kaggle.com/c/airbus-ship-detection/overview/evaluation
'''
def rle_encoding(x):
    '''
    x: pytorch tensor on gpu, 1 - mask, 0 - background
    Returns run length as list
    '''
    dots = torch.where(torch.flatten(x.long())==1)[0]
    if(len(dots)==0):
      return []
    inds = torch.where(dots[1:]!=dots[:-1]+1)[0]+1
    inds = torch.cat((torch.tensor([0], device=torch.device('cuda'), dtype=torch.long), inds))
    tmpdots = dots[inds]
    inds = torch.cat((inds, torch.tensor([len(dots)], device=torch.device('cuda'))))
    inds = inds[1:] - inds[:-1]
    runs = torch.cat((tmpdots, inds)).reshape((2,-1))
    runs = torch.flatten(torch.transpose(runs, 0, 1)).cpu().data.numpy()
    return ' '.join([str(i) for i in runs])

In [None]:
'''
# You need to upload the csv file on kaggle
# The speed of your code in the previous parts highly affects the running time of this part
'''

preddic = {"ImageId": [], "EncodedPixels": []}

'''
# Writing the predictions of the training set
'''
my_data_list = DatasetCatalog.get("data_detection_{}".format('train'))
for i in tqdm(range(len(my_data_list)), position=0, leave=True):
  sample = my_data_list[i]
  sample['image_id'] = sample['file_name'].split("/")[-1][:-4]
  img, true_mask, pred_mask = get_prediction_mask(sample)
  inds = torch.unique(pred_mask)
  if(len(inds)==1):
    preddic['ImageId'].append(sample['image_id'])
    preddic['EncodedPixels'].append([])
  else:
    for index in inds:
      if(index == 0):
        continue
      tmp_mask = (pred_mask==index)
      encPix = rle_encoding(tmp_mask)
      preddic['ImageId'].append(sample['image_id'])
      preddic['EncodedPixels'].append(encPix)

'''
# Writing the predictions of the test set
'''

my_data_list = DatasetCatalog.get("data_detection_{}".format('test'))
for i in tqdm(range(len(my_data_list)), position=0, leave=True):
  sample = my_data_list[i]
  sample['image_id'] = sample['file_name'].split("/")[-1][:-4]
  img, true_mask, pred_mask = get_prediction_mask(sample)
  inds = torch.unique(pred_mask)
  if(len(inds)==1):
    preddic['ImageId'].append(sample['image_id'])
    preddic['EncodedPixels'].append([])
  else:
    for j, index in enumerate(inds):
      if(index == 0):
        continue
      tmp_mask = (pred_mask==index).double()
      encPix = rle_encoding(tmp_mask)
      preddic['ImageId'].append(sample['image_id'])
      preddic['EncodedPixels'].append(encPix)

pred_file = open("{}/pred.csv".format(BASE_DIR), 'w')
pd.DataFrame(preddic).to_csv(pred_file, index=False)
pred_file.close()


## Part 4: Mask R-CNN

For this part you need to follow a same procedure to part 2 with the configs of Mask R-CNN, other parts are generally the same as part 2.

### Data Loader

In [None]:
'''
# This function should return a list of data samples in which each sample is a dictionary.
# Make sure to select the correct bbox_mode for the data
# For the test data, you only have access to the images, therefore, the annotations should be empty.
# Other values could be obtained from the image files.
# TODO: approx 35 lines
'''
VAL_RATE = 0.2 # Precentage of the validate size
def get_detection_data(set_name, datapath="data/train.json"):
    data_dirs = '{}/data'.format(BASE_DIR)
    # return test_set, no annotations
    if set_name == "test":
        test_set = []
        for fname in os.listdir(os.path.join(data_dirs, "test")):
            if os.path.splitext(fname)[1] == ".png":
                path = os.path.join(data_dirs, "test", fname)
                width, height = Image.open(path).size
                test_set.append({
                    "file_name": path,
                    "image_id": os.path.splitext(fname)[0],
                    "height": height,
                    "width": width,
                    "annotations": []
                })
        return test_set
    # return validate_set or train_set, with annotations
    with open(datapath) as f:
        data = json.load(f)
    validate_size = int(len(data)*VAL_RATE)
    train_annotations, validate_annotations = data[0:len(data)-validate_size], data[len(data)-validate_size:]
    annotations = validate_annotations if set_name == "val" else (data if set_name == "all" else train_annotations)
    # return validate_set or train_set, with annotations
    datadict = {}
    for annotation in annotations:
        path = os.path.join(data_dirs, "train", annotation["file_name"])
        anno = {
            "bbox": annotation["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "segmentation": annotation["segmentation"],
            "category_id": annotation["category_id"],
            "iscrowd": annotation["iscrowd"],
            "area": annotation["area"]
        }
        if path in datadict:
            datadict[path]["annotations"].append(anno)
            continue
        width, height = Image.open(path).size
        datadict[path] = {
            "image_id": annotation["image_id"],
            "height": height,
            "width": width,
            "annotations": [{
                "bbox": annotation["bbox"],
                "bbox_mode": BoxMode.XYWH_ABS,
                "segmentation": annotation["segmentation"],
                "category_id": annotation["category_id"],
                "iscrowd": annotation["iscrowd"],
                "area": annotation["area"]
            }]
        }
    return [{"file_name": path, **data} for path, data in datadict.items()]

'''
# Remember to add your dataset to DatasetCatalog and MetadataCatalog
# Consdier "data_detection_train" and "data_detection_test" for registration
# You can also add an optional "data_detection_val" for your validation by spliting the training data
# TODO: approx 5 lines
'''
for i in ["train", "val", "all", "test"]:
    DatasetCatalog.register("data_detection_{}".format(i), lambda i=i: get_detection_data(i))
    MetadataCatalog.get("data_detection_{}".format(i)).set(thing_classes=["not plane 1", "not plane 2", "not plane 3", "not plane 4", "plane"])

DatasetCatalog.register("data_detection_all_ori", lambda i=i: get_detection_data("all", datapath="data/train.json"))
MetadataCatalog.get("data_detection_all_ori").set(thing_classes=["not plane 1", "not plane 2", "not plane 3", "not plane 4", "plane"])

### Set Configs

In [None]:
'''
# Set the configs for the detection part in here.
# TODO: approx 15 lines
'''
cfg = get_cfg()
# model settings
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
# training settings
cfg.DATASETS.TRAIN = ("data_detection_train",)
cfg.DATASETS.TEST = ()
cfg.SOLVER.MAX_ITER = 500
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml") # pretrain model

### Training

In [None]:
'''
# Create a DefaultTrainer using the above config and train the model
# TODO: approx 5 lines
'''
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

### Evaluation and Visualization

In [None]:
'''
# After training the model, you need to update cfg.MODEL.WEIGHTS
# Define a DefaultPredictor
'''
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
predictor = DefaultPredictor(cfg)

In [None]:
'''
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
'''
test_set = get_detection_data("test")
for i in range(3):
    idx = random.randrange(0, len(test_set))
    data = test_set[idx]
    img = cv2.imread(data["file_name"])
    result = predictor(img)
    visualizer = Visualizer(img[:, :, ::-1], metadata=MetadataCatalog.get("data_detection_test"), scale=1.2, instance_mode=ColorMode.IMAGE_BW)
    result = visualizer.draw_instance_predictions(result["instances"].to("cpu"))
    img = result.get_image()[:, :, ::-1]
    cv2.imwrite(os.path.join(cfg.OUTPUT_DIR, f"test_set_{idx}.png"), img)

In [None]:
'''
# Use COCOEvaluator and build_detection_train_loader
# You can save the output predictions using inference_on_dataset
# TODO: approx 5 lines
'''
evaluator = COCOEvaluator("data_detection_val", tasks=("segm",), output_dir=cfg.OUTPUT_DIR)
test_loader = build_detection_test_loader(cfg, "data_detection_val")
print(inference_on_dataset(predictor.model, test_loader, evaluator))