# MASK-RCNN - Public Domain + SD Generated Train/Val Dataset

# 0. Setup

In [2]:
# Mount Google Drive at /content/drive; every dataset and script path used by
# the later cells lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Project locations on Google Drive. Lines that start with % are IPython
# magics (for example %cd and %pip).

main_path = '/content/drive/MyDrive/researchpaper2023/01_scripts/' # directory that contains the project folder
project_folder = 'MASK_RCNN_v00/'
maskrcnn_running_code = 'maskrcnn_running_code/'
cython = 'cython/'
vision = 'vision/'

# Setup commands, kept commented out:
# %cd {main_path}{project_folder}
# %pip install cython
# # Install pycocotools, the version by default in Colab
# # has a bug fixed in https://github.com/cocodataset/cocoapi/pull/354
# %pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

In [4]:
# Pick the compute device once: the GPU (through torch.cuda) when one is
# available, otherwise the CPU. Later cells move the model to `device`.
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Data Preparation


### Define paths for data

In [5]:
# manually upload data in your project folder and denote images and mask file paths

dataset_path = '/content/drive/MyDrive/researchpaper2023/00_datasets/combined_train_dataset/train/'
images_path = '/content/drive/MyDrive/researchpaper2023/00_datasets/combined_train_dataset/train/images/'
mask_path = '/content/drive/MyDrive/researchpaper2023/00_datasets/combined_train_dataset/train/mask_content'

dataset_test_path = '/content/drive/MyDrive/researchpaper2023/00_datasets/testaug_voc_combined/test/'
images_test_path = dataset_test_path + 'images/'
labels_test_path = dataset_test_path + 'labels/'
mask_test_path = dataset_test_path + 'mask_content/'

model_weights_path = '/content/drive/MyDrive/researchpaper2023/01_scripts/MASK_RCNN_v00/results/'

%cd {main_path}{project_folder}

/content/drive/MyDrive/researchpaper2023/01_scripts/MASK_RCNN_v00


In [6]:
import os

# Number of entries (of any kind) in the train/val image folder.
trainval_count = len(os.listdir(images_path))

print('Total Number of Training/Val Images: ', trainval_count) # 2159

Total Number of Training/Val Images:  2159


In [7]:
import os 
def count_images(path):
    """Return how many regular files sit directly inside ``path``.

    Sub-directories are not counted and are not descended into. Files are not
    filtered by extension, so every file in the folder counts as an image.
    """
    return sum(
        1
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )

In [8]:
# Count the files in the image folders of the VOC test set and the
# augmented public test set.
voc_test_images_dir = '/content/drive/MyDrive/researchpaper2023/00_datasets/VOC_dataset/test/images'
pubaug_test_images_dir = '/content/drive/MyDrive/researchpaper2023/00_datasets/public_dataset/test_augmented/images'

VOC_test = count_images(voc_test_images_dir)
pubaug_test = count_images(pubaug_test_images_dir)

print('this is the VOC test dataset', VOC_test)
print('this is the pub_aug test dataset', pubaug_test)

this is the VOC test dataset 1000
this is the pub_aug test dataset 252


# Create masks for training/val from images and label dataset (Public + SD)


In [9]:
import os
from PIL import Image, ImageDraw
import os

def make_masks(base_dir):
    """Rasterise polygon label files into colour-coded instance-mask PNGs.

    For every ``<name>.txt`` in ``<base_dir>/labels`` the matching
    ``<name>.jpg`` in ``<base_dir>/images`` is opened to obtain the mask size,
    every label line is drawn as one filled polygon on a black canvas, and the
    result is saved as ``<name>.png`` in ``<base_dir>/mask_content`` (the
    folder is created when missing).

    Label format, as parsed here: one polygon per line, whitespace separated.
    The first token is skipped (presumably the class id - confirm against the
    label source) and the remaining tokens are alternating x / y values that
    are multiplied by the image width / height.

    Colour encoding: the k-th polygon of a file (1-based) is filled with
    ``R = k & 0xFF``, ``G = (k >> 8) & 0xFF``, ``B = (k >> 16) & 0xFF``, so
    ``R + 256 * G + 65536 * B == k`` and black stays reserved for the
    background. The previous ``255 - i`` scheme painted polygon index 255 in
    black and produced negative channel values for larger indices.

    Args:
        base_dir: dataset folder holding the ``labels`` and ``images``
            sub-folders.

    Raises:
        ValueError: if a label line has an odd number of coordinate values.
    """
    label_dir = os.path.join(base_dir, 'labels')
    image_dir = os.path.join(base_dir, 'images')
    mask_dir = os.path.join(base_dir, 'mask_content')
    os.makedirs(mask_dir, exist_ok=True)

    label_files = [name for name in os.listdir(label_dir) if name.endswith(".txt")]
    total_count = len(label_files)

    for count, filename in enumerate(label_files, start=1):
        # progress: first file, every 100th after it, and the last one
        if count % 100 == 1 or count == total_count:
            print(count, "/", total_count)

        stem = os.path.splitext(filename)[0]

        # Only the size is needed from the image; close it right away.
        with Image.open(os.path.join(image_dir, stem + ".jpg")) as image:
            width, height = image.size

        polygons = []
        with open(os.path.join(label_dir, filename), "r") as label_file:
            for line in label_file:
                # split() tolerates repeated blanks, tabs and the trailing newline
                values = [float(v) for v in line.split()[1:]]
                if len(values) % 2:
                    raise ValueError(
                        "odd number of polygon coordinates in " + filename
                    )
                # blank lines and outlines with fewer than 3 vertices enclose
                # no area, so they are not drawn
                if len(values) < 6:
                    continue
                polygons.append(
                    [(x * width, y * height)
                     for x, y in zip(values[0::2], values[1::2])]
                )

        # black canvas of the same size as the source image
        mask_image = Image.new('RGB', (width, height), (0, 0, 0))
        draw = ImageDraw.Draw(mask_image)
        for instance_id, polygon in enumerate(polygons, start=1):
            colour = (
                instance_id & 0xFF,
                (instance_id >> 8) & 0xFF,
                (instance_id >> 16) & 0xFF,
            )
            draw.polygon(polygon, fill=colour)

        mask_image.save(os.path.join(mask_dir, stem + ".png"))

In [10]:
# Mask generation calls, kept commented out:
# make_masks("/content/drive/MyDrive/researchpaper2023/00_datasets/combined_train_dataset/train/")
# make_masks("/content/drive/MyDrive/researchpaper2023/00_datasets/testaug_voc_combined/test/")

# Number of entries in the test mask folder.
count = len(os.listdir("/content/drive/MyDrive/researchpaper2023/00_datasets/testaug_voc_combined/test/mask_content"))
print(count)

1252


In [11]:
import os
import numpy as np
import torch
import torch.utils.data
from PIL import Image

# create torch dataset class

class Greentea(torch.utils.data.Dataset):
    """Instance-segmentation dataset backed by colour-coded mask images.

    ``root`` must contain two sub-folders, ``images`` and ``mask_content``.
    Files are paired by position after sorting both listings, so both folders
    must hold the same number of entries in matching order.

    In a mask image every distinct non-black colour is one object instance
    and black (id 0) is the background. ``__getitem__`` returns
    ``(img, target)`` where ``target`` is a dict with the keys ``boxes``,
    ``labels``, ``masks``, ``image_id``, ``area`` and ``iscrowd``; all
    per-instance entries have the same length.
    """

    # A box is kept only when both its width and height extents
    # (max - min pixel index) are larger than this many pixels.
    MIN_BOX_EXTENT = 2

    def __init__(self, root, transforms=None):
        """Index the image and mask files below ``root``.

        Args:
            root: dataset folder containing ``images`` and ``mask_content``.
            transforms: optional callable ``(img, target) -> (img, target)``
                applied at the end of ``__getitem__``.

        Raises:
            ValueError: if the two folders hold a different number of
                entries, because positional pairing would then be wrong.
        """
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "images")))
        self.masks = sorted(os.listdir(os.path.join(root, "mask_content")))
        if len(self.imgs) != len(self.masks):
            raise ValueError(
                "images/ holds {} entries but mask_content/ holds {}; "
                "they are paired by sorted position and must match".format(
                    len(self.imgs), len(self.masks)
                )
            )

    @staticmethod
    def _decode_instance_map(mask_rgb):
        """Collapse an (H, W, 3) RGB mask into an (H, W) array of instance ids.

        The id is ``R + 256 * G + 65536 * B``. The arithmetic is done in
        int32: on the uint8 pixel data the G and B contributions cannot be
        represented and would be lost.
        """
        channels = np.asarray(mask_rgb).astype(np.int32)
        return channels[:, :, 0] + channels[:, :, 1] * 256 + channels[:, :, 2] * 65536

    @classmethod
    def _build_target(cls, instance_map, idx):
        """Build the target dict for one image from its instance-id map.

        Instances whose box is not larger than ``MIN_BOX_EXTENT`` in both
        directions are dropped from ``boxes`` *and* from ``masks``,
        ``labels``, ``area`` and ``iscrowd`` so that all stay aligned.
        """
        obj_ids = np.unique(instance_map)
        # id 0 is the background; drop it by value so a mask without any
        # background pixel does not lose its first real object
        obj_ids = obj_ids[obj_ids != 0]

        # split the id map into one boolean mask per instance: (N, H, W)
        masks = instance_map == obj_ids[:, None, None]

        boxes = []
        kept = []
        for i in range(len(obj_ids)):
            rows, cols = np.where(masks[i])
            xmin, xmax = int(cols.min()), int(cols.max())
            ymin, ymax = int(rows.min()), int(rows.max())

            wide_enough = (xmax - xmin) > cls.MIN_BOX_EXTENT
            tall_enough = (ymax - ymin) > cls.MIN_BOX_EXTENT
            if wide_enough and tall_enough:
                boxes.append([xmin, ymin, xmax, ymax])
                kept.append(i)

        num_objs = len(kept)
        masks = masks[np.asarray(kept, dtype=np.intp)]

        # explicit (N, 4) shape so that N == 0 still yields a 2-D tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(num_objs, 4)

        target = {}
        target["boxes"] = boxes
        # there is only one class
        target["labels"] = torch.ones((num_objs,), dtype=torch.int64)
        target["masks"] = torch.as_tensor(masks, dtype=torch.uint8)
        target["image_id"] = torch.tensor([idx])
        # shape (N,), also when N == 0
        target["area"] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        target["iscrowd"] = torch.zeros((num_objs,), dtype=torch.int64)
        return target

    def __getitem__(self, idx):
        """Load image ``idx`` and its target dict, then apply ``transforms``."""
        img_path = os.path.join(self.root, "images", self.imgs[idx])
        mask_path = os.path.join(self.root, "mask_content", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # Force three channels so an alpha channel or another image mode
        # cannot change the array layout that the decoder expects.
        mask_rgb = np.array(Image.open(mask_path).convert("RGB"))

        instance_map = self._decode_instance_map(mask_rgb)
        target = self._build_target(instance_map, idx)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of images found in ``<root>/images``."""
        return len(self.imgs)

## Finetuning from a pretrained model

In [12]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

      
def get_instance_segmentation_model(num_classes):
    """Return a COCO-pretrained Mask R-CNN with fresh prediction heads.

    The box predictor and the mask predictor of the pretrained
    ``maskrcnn_resnet50_fpn`` model are replaced by new ones sized for
    ``num_classes``; the rest of the model is left as loaded.

    Args:
        num_classes: number of output classes passed to both new heads.

    Returns:
        The modified model.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    roi_heads = model.roi_heads

    # Box head: keep the input width of the pretrained classifier.
    box_in_features = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: keep the input channel count of the pretrained head.
    mask_in_channels = roi_heads.mask_predictor.conv5_mask.in_channels
    mask_hidden_channels = 256
    roi_heads.mask_predictor = MaskRCNNPredictor(
        mask_in_channels, mask_hidden_channels, num_classes
    )

    return model

## Training and evaluation functions

In [13]:
# Download TorchVision repo to use some files from
# references/detection

# %cd {main_path}{project_folder}
# !rm -rf {vision}
# !git clone https://github.com/pytorch/vision.git {vision}
# %cd {vision}

# # Get the right version of torchvision
# !git checkout v0.13.1

# # !mkdir -p {maskrcnn_running_code}

# Copy over the helper files we need.
# %cp references/detection/utils.py {main_path}{project_folder}{maskrcnn_running_code}
# %cp references/detection/transforms.py {main_path}{project_folder}{maskrcnn_running_code}
# %cp references/detection/coco_eval.py {main_path}{project_folder}{maskrcnn_running_code}
# %cp references/detection/engine.py {main_path}{project_folder}{maskrcnn_running_code}
# %cp references/detection/coco_utils.py {main_path}{project_folder}{maskrcnn_running_code}

In [14]:
%cd {main_path}{project_folder}{maskrcnn_running_code}

from engine import train_one_epoch, evaluate
import utils
import transforms as T

from torchvision.transforms import functional as F

# Add custom ToTensor taken from an older version of torchvision 0.8.2
# This still works, they just decided to remove that helper for some reason.
class ToTensor:
    """Paired transform that converts only the image to a tensor.

    Calls ``F.to_tensor`` on the image and passes the target through
    untouched, so it can be chained with the other ``(image, target)``
    transforms.
    """

    def __call__(self, image, target):
        return F.to_tensor(image), target

def get_transform(train):
    """Compose the (image, target) transforms for one dataset.

    Args:
        train: when truthy, a random horizontal flip with probability 0.5 is
            added after the tensor conversion as data augmentation.

    Returns:
        A ``T.Compose`` of the selected transforms.
    """
    # The PIL image is always converted to a tensor first.
    steps = [ToTensor()]
    if train:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)

/content/drive/MyDrive/researchpaper2023/01_scripts/MASK_RCNN_v00/maskrcnn_running_code


In [15]:
# Hyper-parameters and model settings.
# copy Forensic Architecture on these parameters

batch_size = 32 # 64
training_epochs = 20
IMG_SIZE = (512, 512)
IMG_SHAPE = (*IMG_SIZE, 3) # height, width plus the RGB channel dimension
classes = 2
include_top = False
weights = None
pooling = None
learning_rates = 1e-4
# acc = torchmetrics.functional.accuracy(preds, target, task="multiclass", num_classes=5)


In [16]:
# Random (seeded) train-validation split of the combined dataset.
train_data_portion = 0.8 # Proportion of data for training # 80/20

# use our dataset and defined transformations
trainval_dataset = Greentea(dataset_path, get_transform(train=False)) # dataset_path

# Size the split from the dataset itself so it cannot drift from a count
# computed separately in another cell.
train_images_index = round(len(trainval_dataset) * train_data_portion)

# split the dataset in train and val set; the fixed seed makes the
# permutation, and therefore the split, reproducible
torch.manual_seed(42) # used to be 1
indices = torch.randperm(len(trainval_dataset)).tolist()
train_ds = torch.utils.data.Subset(trainval_dataset, indices[:train_images_index])
val_ds = torch.utils.data.Subset(trainval_dataset, indices[train_images_index:])

# define training and validation data loaders
train_dl = torch.utils.data.DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

# Evaluation does not benefit from shuffling; a fixed order keeps the
# validation pass deterministic.
val_dl = torch.utils.data.DataLoader(
    val_ds, batch_size=batch_size, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

print('Train-validation split:')
print('{} batches in training dataset and {} batches in validation dataset'.format(len(train_dl), len(val_dl)))

Train-validation split:
54 batches in training dataset and 14 batches in validation dataset


Now let's instantiate the model and the optimizer

In [17]:
# our dataset has two classes only - background and bottle
num_classes = 2

# Build the model with the helper defined earlier and move it to the
# selected device before the optimizer collects its parameters.
model = get_instance_segmentation_model(num_classes)
model.to(device)

# Optimise only the parameters that require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=learning_rates,
    momentum=0.9,
    weight_decay=0.0005,
)

# Learning rate scheduler which decreases the learning rate by 10x every
# 3 epochs.
# NOTE(review): over 20 epochs this applies the 0.1 factor six times
# (1e-4 -> 1e-10); confirm that this much decay is intended.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
) # Ishaani: consider removing if not needed

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

In [18]:
# Write each given list as one row of a new CSV file.
def write_csv(filename, *args):
    """Create (or overwrite) ``filename`` and write one CSV row per list.

    Args:
        filename: path of the CSV file to write.
        *args: iterables of values; each one becomes a single comma-separated
            row (not a column), in the order given.
    """
    # Imported locally: no earlier cell of this notebook imports csv, so the
    # function would otherwise fail with a NameError when called.
    import csv

    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(args)

# https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html

!pip install torchmetrics
import torch
import torchmetrics
from torch.optim.lr_scheduler import StepLR # let's not use step scheduler
from torchmetrics.classification import BinaryAUROC
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from time import time
torch.cuda.empty_cache()
num_epochs = 20 # number of epochs; 
# criterion = torch.nn.BCELoss() # binary cross entropy loss function
# metric = BinaryAUROC(thresholds=None)
# train_map_metric = MeanAveragePrecision(compute_on_cpu=True)
# val_map_metric = MeanAveragePrecision()
# try map 50 and map50:95

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.3-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.6/518.6 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.3


## Training

In [26]:
# Free cached GPU memory before the run starts.
torch.cuda.empty_cache()

# NOTE(review): the recorded output of this cell shows "lr: 0.000000" well
# into epoch 0, which suggests the cell was re-run with an optimizer and
# scheduler that had already decayed - re-create them before training again.
model.train()
for epoch in range(num_epochs):
    # one pass over the training loader, logging every 10 iterations
    train_one_epoch(model, optimizer, train_dl, device, epoch, print_freq=10)
    # advance the learning rate schedule once per epoch
    lr_scheduler.step()
    # torch.cuda.empty_cache()
    # score the current weights on the validation loader
    evaluate(model, val_dl, device=device)

Epoch: [0]  [ 0/54]  eta: 0:02:05  lr: 0.000000  loss: 0.7769 (0.7769)  loss_classifier: 0.1526 (0.1526)  loss_box_reg: 0.2462 (0.2462)  loss_mask: 0.3481 (0.3481)  loss_objectness: 0.0225 (0.0225)  loss_rpn_box_reg: 0.0075 (0.0075)  time: 2.3174  data: 1.3485  max mem: 20641
Epoch: [0]  [10/54]  eta: 0:00:48  lr: 0.000000  loss: 0.6462 (0.6313)  loss_classifier: 0.1144 (0.1224)  loss_box_reg: 0.1655 (0.1749)  loss_mask: 0.3165 (0.3074)  loss_objectness: 0.0208 (0.0207)  loss_rpn_box_reg: 0.0060 (0.0059)  time: 1.1005  data: 0.1876  max mem: 20641
Epoch: [0]  [20/54]  eta: 0:00:35  lr: 0.000000  loss: 0.6462 (0.6361)  loss_classifier: 0.1125 (0.1209)  loss_box_reg: 0.1655 (0.1739)  loss_mask: 0.3165 (0.3147)  loss_objectness: 0.0180 (0.0205)  loss_rpn_box_reg: 0.0057 (0.0061)  time: 0.9784  data: 0.0712  max mem: 20641
Epoch: [0]  [30/54]  eta: 0:00:24  lr: 0.000000  loss: 0.6643 (0.6453)  loss_classifier: 0.1219 (0.1238)  loss_box_reg: 0.1761 (0.1795)  loss_mask: 0.3186 (0.3150)  loss

In [27]:
# Save the weights to the first unused results/run<N>.pt file.
# The target folder comes from `model_weights_path` (defined with the other
# paths) and is created when missing so that torch.save cannot fail on it.
os.makedirs(model_weights_path, exist_ok=True)

index = 0
p = os.path.join(model_weights_path, 'run' + str(index) + ".pt")
while os.path.exists(p):
    # never overwrite an earlier run; move on to the next free index
    index += 1
    p = os.path.join(model_weights_path, 'run' + str(index) + ".pt")
torch.save(model.state_dict(), p) # weights are saved in .pt file
print("Model saved to", p)

Model saved to /content/drive/MyDrive/researchpaper2023/01_scripts/MASK_RCNN_v00/results/run5.pt


In [None]:
# TODO: Compute test accuracy: true positives and true negatives

Load model from disk, if needed

In [None]:
# # runs pre-built model

# existing_weights_file = 'final_run.pt'
# model.load_state_dict(torch.load(model_weights_path + existing_weights_file))

# Not tested by Ben yet
Evaluate our model on the test dataset

In [28]:
# Test dataset without augmentation.
test_ds = Greentea(dataset_test_path, get_transform(train=False))

# Evaluation does not benefit from shuffling; a fixed order keeps the test
# pass deterministic.
test_dl = torch.utils.data.DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

In [29]:
# Run engine.evaluate on the test loader; it prints the COCO-style metrics
# and the returned evaluator object is shown as the cell output.
evaluate(model, test_dl, device=device)

creating index...
index created!
Test:  [ 0/40]  eta: 0:02:55  model_time: 2.0349 (2.0349)  evaluator_time: 1.2153 (1.2153)  time: 4.3955  data: 1.1138  max mem: 20641
Test:  [39/40]  eta: 0:00:03  model_time: 2.0900 (2.0078)  evaluator_time: 1.4220 (1.3992)  time: 3.4484  data: 0.0519  max mem: 20641
Test: Total time: 0:02:22 (3.5682 s / it)
Averaged stats: model_time: 2.0900 (2.0078)  evaluator_time: 1.4220 (1.3992)
Accumulating evaluation results...
DONE (t=0.57s).
Accumulating evaluation results...
DONE (t=0.69s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.052
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.125
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.030
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.072
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | 

<coco_eval.CocoEvaluator at 0x7fbd582efa30>

In [25]:
from torchvision import models
from torchsummary import summary

# Print the module tree of the model. `models` and `summary` are imported
# above but are not used in this cell.
print(model)

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in