In [2]:
import numpy as np 
import pandas as pd 
import os
from torchvision import transforms, utils, models, datasets
import json
import pickle
import torch, random, copy
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import torchvision.transforms.functional as F
from torchmetrics.detection.mean_ap import MeanAveragePrecision

import plotly.express as pe
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from utils.engine import train_one_epoch, evaluate
from utils import utils

import torch
from PIL import Image, ImageOps
from utils import transforms as T

# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels
# synchronously so a device-side error is reported at the call that caused it.
# It slows training, so drop it for normal runs.
# NOTE(review): set after `import torch`; presumably still effective because it
# precedes the first CUDA call — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Load the class table used throughout the notebook. ADEDataset below treats
# `classes` as a mapping from object name to integer class id
# (`classes.keys()` / `classes[name]`).
# SECURITY: pickle.load executes arbitrary code from the file — only load a
# pickle you created yourself or otherwise trust.
with open('ADE20k_classes.pkl', 'rb') as f:
    classes = pickle.load(f)

# Train

In [2]:
def imshow(img_pt, trues, preds=None):
    """Show an image with ground-truth and (optionally) predicted boxes.

    Ground-truth boxes are drawn as solid green rectangles labelled at their
    top-left corner; predictions are drawn as dotted red rectangles labelled
    with class and score at their top-right corner.

    Args:
        img_pt: image tensor accepted by ``F.to_pil_image``.
        trues: dict with ``"boxes"`` (rows of ``[x0, y0, x1, y1]``) and
            ``"labels"`` (one integer label per box).
        preds: optional dict with tensor entries ``"boxes"``, ``"labels"``
            and ``"scores"``; skipped when ``None``.

    NOTE(review): labels are looked up as ``classes[int_label]``. ADEDataset
    uses ``classes`` as a name -> id mapping and shifts labels by -1, so
    confirm that ``classes`` is also indexable by the integer ids passed here.
    """
    fig = pe.imshow(F.to_pil_image(img_pt))

    # Convert to plain Python numbers so ground truth reaches plotly in the
    # same form as the predictions below (no 0-dim tensors).
    true_boxes = torch.as_tensor(trues["boxes"]).cpu().tolist()
    true_labels = torch.as_tensor(trues["labels"]).cpu().tolist()
    for box, label in zip(true_boxes, true_labels):
        fig.add_shape(type='rect',
                      x0=box[0], x1=box[2],
                      y0=box[1], y1=box[3],
                      xref='x', yref='y',
                      line=dict(color='green', width=2))
        fig.add_annotation(x=box[0], y=box[1], text=str(classes[label]),
                           xref="x", yref="y", showarrow=False,
                           font_size=10, font_color='green',
                           bgcolor="white")

    if preds is not None:
        pred_rows = zip(preds["boxes"].cpu().tolist(),
                        preds["labels"].cpu().tolist(),
                        preds["scores"].cpu().tolist())
        for box, label, score in pred_rows:
            fig.add_shape(type='rect',
                          x0=box[0], x1=box[2],
                          y0=box[1], y1=box[3],
                          xref='x', yref='y',
                          line=dict(color='red', width=2, dash='dot'))
            fig.add_annotation(x=box[2], y=box[1], text=f"{classes[label]} {score*100:.1f}%",
                               xref="x", yref="y", showarrow=False,
                               font_size=10, font_color='red',
                               bgcolor="white")
    fig.show()

In [127]:
class ADEDataset(torch.utils.data.Dataset):
    """Detection / instance-segmentation dataset over ADE20K-style files.

    Layout read from ``root``:
      * ``<filename>``: split file, one image name per line (first
        space-separated token is used).
      * ``imgs/ADE_val_<name>.jpg``: the image.
      * ``jsons/ADE_val_<name>.json``: annotation with
        ``annotation.object[*]`` entries carrying ``name``, ``polygon.x``,
        ``polygon.y`` and ``instance_mask``.
      * ``instance_mask_backup/<instance_mask>``: per-object mask image.

    The ``ADE_val_`` prefix is used for every split. Object names are matched
    against the module-level ``classes`` mapping (name -> integer id).
    """

    def __init__(self, root,  filename, transforms):
        """Store the paths and read the list of image names for this split.

        Args:
            root: dataset root directory.
            filename: split file name, relative to ``root``.
            transforms: callable ``(img, target) -> (img, target)`` or ``None``.
        """
        self.root = root
        self.transforms = transforms
        self.image_list = self.read_file(os.path.join(root, filename))
        self.len = len(self.image_list)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the ``idx``-th image of the split.

        ``target`` holds ``boxes``, ``labels``, ``masks``, ``image_id``,
        ``area`` and ``iscrowd`` (see ``_build_target``).
        """
        image_name = self.image_list[idx]
        img_path = os.path.join(self.root, "imgs", f"ADE_val_{image_name}.jpg")
        img = Image.open(img_path).convert("RGB")
        json_path = os.path.join(self.root, "jsons", f"ADE_val_{image_name}.json")
        mask_path = os.path.join(self.root, "instance_mask_backup")

        with open(json_path) as f:
            img_data = json.load(f)

        labels = []
        masks = []
        boxes = []
        for obj in img_data['annotation']['object']:
            # An object name may list several comma-separated synonyms; the
            # first synonym found in `classes` decides the label.
            for single_name in obj['name'].split(","):
                single_name = single_name.strip()
                if single_name not in classes:
                    continue

                xs = obj['polygon']['x']
                ys = obj['polygon']['y']
                xmin, xmax = min(xs), max(xs)
                ymin, ymax = min(ys), max(ys)
                if xmin == xmax or ymin == ymax:
                    # Degenerate (zero-width or zero-height) box: drop the object.
                    break

                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(classes[single_name])

                instance_path = os.path.join(mask_path, obj["instance_mask"])
                # Context manager closes the mask file once it is copied.
                with Image.open(instance_path) as mask_img:
                    masks.append(np.array(mask_img))
                break

        width, height = img.size
        target = self._build_target(boxes, labels, masks, idx,
                                    mask_shape=(height, width))

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def _build_target(self, boxes, labels, masks, idx, mask_shape=None):
        """Pack per-object Python lists into the tensor dict for one image.

        Args:
            boxes: list of ``[xmin, ymin, xmax, ymax]``; may be empty.
            labels: list of integer class ids as stored in ``classes``.
            masks: list of equally-shaped 2-D arrays, one per box.
            idx: dataset index, stored as ``image_id``.
            mask_shape: ``(height, width)`` used for the empty-mask tensor
                when there are no objects.

        Returns:
            dict with ``boxes`` (N, 4) float32, ``labels`` (N,) int64,
            ``masks`` uint8, ``image_id`` (1,), ``area`` (N,) float32 and
            ``iscrowd`` (N,) int64 zeros.
        """
        # reshape(-1, 4) keeps the array 2-D even with zero objects, so the
        # column slicing below cannot fail on an empty image.
        boxes_t = torch.as_tensor(
            np.array(boxes, dtype=np.float32).reshape(-1, 4), dtype=torch.float32)

        if len(masks) > 0:
            masks_t = torch.as_tensor(np.array(masks), dtype=torch.uint8)
        elif mask_shape is not None:
            masks_t = torch.zeros((0, *mask_shape), dtype=torch.uint8)
        else:
            masks_t = torch.zeros((0,), dtype=torch.uint8)

        # Area stays float32: the previous uint8 cast wrapped modulo 256 for
        # any box larger than 255 px^2.
        area = (boxes_t[:, 3] - boxes_t[:, 1]) * (boxes_t[:, 2] - boxes_t[:, 0])

        target = {}
        target["boxes"] = boxes_t
        # NOTE(review): ids are shifted down by one; torchvision detection
        # models reserve label 0 for background — confirm this offset matches
        # the ids stored in `classes` and the model's num_classes.
        target["labels"] = torch.as_tensor(np.array(labels, dtype=np.int64), dtype=torch.int64) - 1
        target["masks"] = masks_t
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        # All instances are treated as non-crowd.
        target["iscrowd"] = torch.zeros((len(boxes),), dtype=torch.int64)
        return target

    def __len__(self):
        """Number of images listed in the split file."""
        return len(self.image_list)

    def read_file(self,filename):
        """Read a split file and return the image name from each line.

        The image name is the first space-separated token of the line.
        Blank lines are skipped.
        """
        image_list = []
        with open(filename, 'r') as f:
            for line in f:
                # rstrip removes trailing whitespace (newline, carriage
                # return, tab, space).
                stripped = line.rstrip()
                if not stripped:
                    continue
                image_list.append(stripped.split(' ')[0])
        return image_list

In [5]:
# root = "../ADE20K"
# image_name = "00000001"
# img_path = os.path.join(root, "imgs", f"ADE_val_{image_name}.jpg")
# #mask_path = os.path.join(root, "masks", f"ADE_val_{image_name}_seg.png")
# json_path = os.path.join(root, "jsons", f"ADE_val_{image_name}.json")
# mask_path = os.path.join(root, "instance_mask_backup")#, f"ADE_val_{image_name}")

# img = Image.open(img_path).convert("RGB")
# # mask = Image.open(mask_path).convert("RGB")
# # # convert the PIL Image into a numpy array
# # mask = ImageOps.grayscale(mask)
# # obj_ids = np.unique(mask)[1:]
# # # of binary masks

# #masks = mask == obj_ids[:, None, None]
# with open(json_path) as f:
#     data = json.load(f)

# label = []
# masks = []
# for obj in data['annotation']['object']:
#     id = obj['id']
#     obj_name = obj['name'].split(",")
#     for single_name in obj_name:    
#         single_name = single_name.strip() 
#         if  single_name in classes.keys():
#             label.append(classes[single_name])
#             # print(obj_name, single_name, classes[single_name])
#             instance_path = os.path.join(mask_path, obj["instance_mask"])#f"instance_{id:03d}_ADE_val_{image_name}.png")
#             #print(id, instance_path)
#             mask = np.array(Image.open(instance_path))
#             masks.append(mask)

#             break



In [128]:
def get_transform(train):
    """Compose the image/target transform pipeline.

    The image is always converted from PIL to a float tensor; when ``train``
    is truthy a horizontal flip with probability 0.5 is appended.
    """
    pipeline = [T.PILToTensor(), T.ConvertImageDtype(torch.float)]
    if train:
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

In [129]:
root = "../ADE20K"

# Training split: random-flip augmentation and shuffling enabled.
train_data = ADEDataset(root,"train.txt", get_transform(train=True))
train_dataloader = torch.utils.data.DataLoader(
                                train_data, batch_size=2, shuffle=True, num_workers=4,
                                collate_fn=utils.collate_fn)

# Validation / test splits: no augmentation and a fixed order, so evaluation
# numbers are deterministic and comparable between epochs and runs.
val_data = ADEDataset(root,"val.txt", get_transform(train=False))
val_dataloader = torch.utils.data.DataLoader(
                                val_data, batch_size=2, shuffle=False, num_workers=4,
                                collate_fn=utils.collate_fn)
test_data = ADEDataset(root,"test.txt", get_transform(train=False))
test_dataloader = torch.utils.data.DataLoader(
                                test_data, batch_size=2, shuffle=False, num_workers=4,
                                collate_fn=utils.collate_fn)


In [130]:
# Sanity check: report how many images each split file lists.
print(f"train dataset :{len(train_data)} images")
print(f"val dataset :{len(val_data)} images")
print(f"test dataset :{len(test_data)} images")
# print(f"the meta data in image:{voctrainval_ds[0][1].keys()}")

train dataset :1400 images
val dataset :200 images
test dataset :400 images


In [131]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_model_instance_segmentation(num_classes):
    """Return a Mask R-CNN (ResNet-50 FPN) with heads re-sized to ``num_classes``.

    The model is created with ``weights="DEFAULT"``; its box predictor and
    mask predictor are then replaced by freshly constructed heads that output
    ``num_classes`` classes.
    """
    mask_rcnn = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
    heads = mask_rcnn.roi_heads

    # Box head: reuse the classifier's input width, swap in a new predictor.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: same idea, with a 256-channel hidden layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels, 256, num_classes)

    return mask_rcnn

In [132]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#device = torch.device('cpu')

lr = 0.001
# NOTE(review): not referenced by the DataLoaders built earlier, which
# hard-code batch_size=2.
batch_size = 8
weight_decay=1e-5
# NOTE(review): torchvision detection models count background in num_classes
# — confirm 150 matches the label ids produced by ADEDataset.
num_classes = 150
#model = get_model_instance_segmentation(num_classes).to(device)
# Name the backbone weights explicitly. The previous `weights_backbone=True`
# relied on torchvision's deprecated boolean handling, which resolves to the
# same IMAGENET1K_V1 weights but emits deprecation warnings.
model = models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(
    score_thresh=0.5,
    weights_backbone="IMAGENET1K_V1",
    num_classes=num_classes,
).to(device)
# Optimise only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr = lr, weight_decay=weight_decay)




In [133]:
num_epochs = 10

# StepLR: multiply the learning rate by `gamma` every `step_size` epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

for epoch in range(num_epochs):
    # One pass over the training loader; progress is logged every 10 iterations.
    metric_logger = train_one_epoch(model, optimizer, train_dataloader, device, epoch=epoch, print_freq=10, scaler=None)
    lr_scheduler.step()
    # Evaluate on the validation split after every epoch.
    evaluate(model, val_dataloader, device=device)

Epoch: [0]  [  0/700]  eta: 0:13:44  lr: 0.000002  loss: 6.6806 (6.6806)  loss_classifier: 5.0127 (5.0127)  loss_box_reg: 0.7323 (0.7323)  loss_objectness: 0.6922 (0.6922)  loss_rpn_box_reg: 0.2434 (0.2434)  time: 1.1777  data: 1.0438  max mem: 4223
Epoch: [0]  [ 10/700]  eta: 0:04:07  lr: 0.000017  loss: 6.6806 (6.5008)  loss_classifier: 4.8433 (4.7727)  loss_box_reg: 0.7323 (0.6945)  loss_objectness: 0.6939 (0.6941)  loss_rpn_box_reg: 0.3517 (0.3395)  time: 0.3594  data: 0.2419  max mem: 4223
Epoch: [0]  [ 20/700]  eta: 0:05:23  lr: 0.000031  loss: 5.6418 (5.6386)  loss_classifier: 4.1252 (3.9317)  loss_box_reg: 0.6255 (0.6859)  loss_objectness: 0.6913 (0.6925)  loss_rpn_box_reg: 0.2639 (0.3284)  time: 0.4403  data: 0.3216  max mem: 4223
Epoch: [0]  [ 30/700]  eta: 0:05:26  lr: 0.000045  loss: 4.4380 (5.1734)  loss_classifier: 2.5319 (3.4289)  loss_box_reg: 0.7452 (0.7107)  loss_objectness: 0.6886 (0.6907)  loss_rpn_box_reg: 0.2142 (0.3430)  time: 0.5576  data: 0.4330  max mem: 4223
