In [None]:
import os, sys
import numpy as np
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms, utils, models, datasets
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
torch.__version__

In [None]:
# Global configuration: everything a reader might want to tune lives here.
RANDOM_STATE = 100   # seed shared by every random number generator below
# Seed numpy and torch so shuffling and weight initialisation are repeatable
# across "Restart & Run All" (previously RANDOM_STATE was defined but unused).
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

BATCH_SIZE = 16      # images per mini-batch
EPOCHS = 1           # number of passes over the training set
LR = 1e-3            # optimizer learning rate
WD = 1e-5            # optimizer weight decay
# Prefer the first GPU when one is visible, otherwise fall back to the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### 讀取資料集 - [VOC](https://drive.google.com/drive/folders/1TrJjsoIZ3QWecvOLGKCSa4NBk3qnObin)
- [VOCDetection](https://pytorch.org/vision/stable/generated/torchvision.datasets.VOCDetection.html)
- https://medium.com/codex/implementing-r-cnn-object-detection-on-voc2012-with-pytorch-b05d3c623afe
- ChatGPT

In [None]:
class VOCDataset(Dataset):
    """Wrap a VOC-detection style dataset and emit Mask R-CNN style targets.

    Every item of the wrapped ``dataset`` must be ``(image, info)`` where
    ``info['annotation']`` holds a ``filename`` and a list of ``object``
    entries, each with a ``name`` and a ``bndbox`` (xmin/ymin/xmax/ymax).

    ``__getitem__`` returns ``(image, target)`` with ``target`` containing
    ``image_id``, ``boxes`` (N x 4, xmin/ymin/xmax/ymax), ``labels`` (N),
    ``area`` (N) and all-zero placeholder ``masks`` (N x H x W).

    NOTE(review): boxes are copied from the annotation unchanged. If the
    wrapped dataset crops or resizes the image, the boxes are NOT adjusted
    to match -- confirm whether that is intended.
    NOTE(review): labels are 0-based indices into ``self.labels``, while
    torchvision detection models treat class 0 as background -- confirm the
    label offset together with the model's ``num_classes``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # Position in this list is the integer class id of the category.
        self.labels = ['train', 'car', 'chair', 'pottedplant', 'horse', 'cat', 'cow', 'bus', 'bicycle', 'person', 'dog', 'tvmonitor',
                       'bird', 'motorbike', 'boat', 'aeroplane', 'sofa', 'sheep', 'bottle', 'diningtable']

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at ``index``."""
        # Index the wrapped dataset once: every access re-loads and
        # re-transforms the image, so indexing twice doubles the work.
        sample = self.dataset[index]
        img = sample[0]
        annotation = sample[1]['annotation']
        objects = annotation['object']

        labels, boxes, areas = [], [], []
        for obj in objects:
            # Class id
            labels.append(self.labels.index(obj['name']))

            # Bounding box corners
            bndbox = obj['bndbox']
            xmin = int(bndbox['xmin'])
            ymin = int(bndbox['ymin'])
            xmax = int(bndbox['xmax'])
            ymax = int(bndbox['ymax'])
            boxes.append([xmin, ymin, xmax, ymax])

            # Box area
            areas.append((xmax - xmin) * (ymax - ymin))

        # Size the placeholder masks after the actual image when it is a
        # tensor; otherwise keep the historical 300 x 300 default.
        if torch.is_tensor(img):
            height, width = int(img.shape[-2]), int(img.shape[-1])
        else:
            height, width = 300, 300

        target = {
            # The file stem (extension stripped) is used as a numeric id.
            'image_id': torch.Tensor([int(annotation['filename'][:-4])]),
            # reshape keeps an (N, 4) layout even when there are no objects.
            'boxes': torch.FloatTensor(boxes).reshape(-1, 4),
            'labels': torch.LongTensor(labels),
            'area': torch.FloatTensor(areas),
            'masks': torch.zeros((len(objects), height, width), dtype=torch.float32),
        }

        return img, target

In [None]:
# VOC preprocessing: PIL image -> float tensor -> 300 x 300 center crop
transform = transforms.Compose([
    transforms.PILToTensor(),
    transforms.ConvertImageDtype(torch.float),
    transforms.CenterCrop(300),
])

# Root folder of the VOC 2007 data; one wrapped dataset per split
dir_path = '/git/Kaiyang/hw4data'
voc_trainset = VOCDataset(
    datasets.VOCDetection(dir_path, year="2007", image_set="train", download=False, transform=transform)
)
voc_valset = VOCDataset(
    datasets.VOCDetection(dir_path, year="2007", image_set="val", download=False, transform=transform)
)
voc_testset = VOCDataset(
    datasets.VOCDetection(dir_path, year="2007", image_set="test", download=False, transform=transform)
)

# Report how many samples each split contains
print(f'訓練資料: {len(voc_trainset)}筆')
print(f'驗證資料: {len(voc_valset)}筆')
print(f'測試資料: {len(voc_testset)}筆')

In [None]:
# 蒐集labels
# labels_list = []
# for i, data in enumerate(voc_trainset):
#     label = data[1]['annotation']['object'][0]['name']
#     if label not in labels_list:
#         labels_list.append(label)
# print(labels_list)

### 讀取資料集 - [ADE20K](https://drive.google.com/drive/folders/1hRy6am8KeUWW_6sgj46_Qk7_vbWDWhRT)

In [None]:
# class ADEDataset(Dataset):
#     def __init__(self, path, filename, transform):
#         self.path = path
#         self.image_list = self.read_image(os.path.join(path, filename))
#         self.transform = transform
    
#     def read_image(self, filename):
#         image_list = []
#         with open(filename, 'r') as f:
#             lines = f.readlines()
#             for line in lines:
#                 img = line.rstrip().split(' ')[0]
#                 image_list.append(img)
#         return image_list
    
#     def __len__(self):
#         return len(self.path)
    
#     def __getitem__(self, idx):
#         # load images and masks
#         image_name= self.image_list[idx]
#         img_path = os.path.join(self.root, "imgs", f"ADE_val_{image_name}.jpg")
#         img = Image.open(img_path).convert("RGB")
#         json_path = os.path.join(self.root, "jsons", f"ADE_val_{image_name}.json")
#         mask_path = os.path.join(self.root, "instance_mask_backup")
        
#         with open(json_path) as f:
#             img_data = json.load(f)

#         label = []
#         masks = []
#         boxes = []
#         for obj in img_data['annotation']['object']:
#             id = obj['id']
#             obj_name = obj['name'].split(",")
#             for single_name in obj_name:    
#                 single_name = single_name.strip() 
#                 if  single_name in classes.keys():
#                     xmin = min(obj['polygon']['x'])
#                     xmax = max(obj['polygon']['x'])
#                     ymin = min(obj['polygon']['y'])
#                     ymax = max(obj['polygon']['y'])
#                     if xmin == xmax or ymin == ymax:
#                         break
                        
#                     boxes.append([xmin, ymin, xmax, ymax])
                    
#                     label.append(classes[single_name])

#                     instance_path = os.path.join(mask_path, obj["instance_mask"])
#                     mask = np.array(Image.open(instance_path))
#                     masks.append(mask)
#                     break

#         boxes = np.array(boxes)
#         # convert everything into a torch.Tensor
#         area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
#         # suppose all instances are not crowd

#         target = {}
#         target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32)
#         target["labels"] = torch.as_tensor(np.array(label), dtype=torch.int64) - 1
#         target["masks"] = torch.as_tensor(np.array(masks), dtype=torch.uint8)
#         target["image_id"] = torch.tensor([idx])
#         target["area"] =torch.as_tensor(area, dtype=torch.uint8)
#         target["iscrowd"] = torch.zeros((len(boxes),), dtype=torch.int64)

#         if self.transforms is not None:
#             img, target = self.transforms(img, target)

#         return img, target

In [None]:
# ade_trainset = ADEDataset(dir_path, 'train.txt', transform=transform)
# ade_valset = ADEDataset(dir_path, 'val.txt', transform=transform)
# ade_testset = ADEDataset(dir_path, 'test.txt', transform=transform)

# print(f'訓練資料: {len(ade_trainset)}筆')
# print(f'驗證資料: {len(ade_valset)}筆')
# print(f'測試資料: {len(ade_testset)}筆')

### set DataLoader

In [None]:
# def collate_fn(batch):
#     images = [item[0] for item in batch]
#     targets = [item[1] for item in batch]

#     imgs = []
#     for image in images:
#         imgs.append(image)

#     bndboxes = [target["boxes"] for target in targets]
#     labels = [target["labels"] for target in targets]
#     image_ids = [target["id"] for target in targets]
#     areas = [target["bndbox"] for target in targets]
#     masks = [target["masks"] for target in targets]

#     tars = []
#     for i in range(len(batch)):
#         box = bndboxes[i]
#         label = labels[i]
#         image_id = image_ids[i]
#         area = areas[i]
#         mask = masks[i] 
#         target = {"image_id": image_id, "labels": label, "boxes": box, "area": area, "masks": mask}
#         tars.append(target)

#     return imgs, tars

def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list such as ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked, so images and
    targets of different sizes can share a batch.
    """
    columns = zip(*batch)
    return tuple(columns)

# Only the training split is shuffled; validation and test keep dataset order.
train_loader = DataLoader(
    voc_trainset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    voc_valset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    voc_testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

### Dual Task
- 物件檢測(Object Detection): 目標為檢測出圖像中所有的物體，並且給出物體的位置和邊界框
- 語義分割(Semantic Segmentation): 目標為將圖像中的每個像素分配到相應的語義類別中

### [Mask_RCNN](https://github.com/matterport/Mask_RCNN)
- Faster R-CNN的two-stage模型加上Feature Pyramid Network(FPN)的方法，利用不同維度下特徵層級高的feature maps來進行預測
- 改良Faster R-CNN中ROI Pooling的缺點，使其邊界框和物體定位的精度可以真正達到像素等級
- [參考網址](https://ivan-eng-murmur.medium.com/%E7%89%A9%E4%BB%B6%E5%81%B5%E6%B8%AC-s9-mask-r-cnn-%E7%B0%A1%E4%BB%8B-99370c98de28)
- [如何使用MRCNN](https://github.com/sagieppel/Train_Mask-RCNN-for-object-detection-in_In_60_Lines-of-Code/blob/main/test.py)

In [None]:
# Start from a pre-trained Mask R-CNN and swap its prediction heads so that
# both the box head and the mask head emit the same number of classes.
# NOTE(review): torchvision detection models treat class 0 as background, so
# num_classes is normally "object classes + 1" -- confirm against the label
# ids produced by the dataset before changing this value.
NUM_CLASSES = 20
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

# Box head: new classifier/regressor sized for NUM_CLASSES
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=NUM_CLASSES)

# Mask head: previously left at the pre-trained class count; replace it so it
# agrees with the box head (256 is the hidden width used by the torchvision
# fine-tuning tutorial).
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, NUM_CLASSES)

model.to(DEVICE)

# set the optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR, weight_decay=WD)

# show model architecture
# model.train()

### train - Object Detection

In [None]:
class trainer():
    """Train and evaluate a torchvision-style detection model.

    The model is expected to return a dict of losses when called as
    ``model(images, targets)`` in training mode, and a list of per-image
    prediction dicts (each with a ``'labels'`` tensor) when called as
    ``model(images)`` in evaluation mode.

    Attributes:
        train_loss_history: mean training loss of every finished epoch.
        valid_acc_history: validation accuracy of every finished epoch.
    """

    def __init__(self, model, train_loader, val_loader, test_loader, epochs, optimizer):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.optimizer = optimizer
        self.epochs = epochs

    def _evaluate(self, loader):
        """Return the positional label accuracy of the model over ``loader``.

        For every image, the i-th predicted label is compared with the i-th
        ground-truth label. Ground-truth objects without a prediction at the
        same position count as wrong. Returns 0.0 when the loader yields no
        ground-truth objects.

        NOTE(review): this is a rough proxy, not a detection metric; the
        order of predictions is not matched to the order of annotations.
        """
        self.model.eval()
        correct = 0
        total = 0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for images, targets in loader:
                images = [image.to(self.device) for image in images]
                outputs = self.model(images)
                # Score every image of the batch, not only the first one.
                for target, output in zip(targets, outputs):
                    true_labels = target['labels'].to('cpu')
                    pred_labels = output['labels'].to('cpu')
                    # The model may return fewer (or more) detections than
                    # there are annotated objects; compare the overlap only.
                    overlap = min(len(true_labels), len(pred_labels))
                    correct += int((true_labels[:overlap] == pred_labels[:overlap]).sum().item())
                    total += len(true_labels)
        return correct / total if total else 0.0

    def train(self):
        """Run the training loop, validate after each epoch, then plot the loss."""
        self.train_loss_history = []
        self.valid_acc_history = []

        # Training loop
        for epoch in range(self.epochs):
            # Validation leaves the model in eval mode; switch back so the
            # forward pass returns the loss dict again on later epochs.
            self.model.train()
            train_loss = 0.0
            for images, targets in self.train_loader:
                images = [image.to(self.device) for image in images]
                targets = [{key: value.to(self.device) for key, value in t.items()} for t in targets]
                # Zero the parameter gradients
                self.optimizer.zero_grad()

                # model forward
                loss_dict = self.model(images, targets)

                # Backward pass and optimization
                losses = sum(loss for loss in loss_dict.values())
                losses.backward()
                self.optimizer.step()
                train_loss += losses.item()

            # Validation
            val_acc = self._evaluate(self.val_loader)

            # max(..., 1) avoids a division by zero on an empty train loader.
            train_loss = train_loss / max(len(self.train_loader), 1)
            self.train_loss_history.append(train_loss)
            self.valid_acc_history.append(val_acc)
            print(f"--------------------Epoch {epoch+1}--------------------")
            print(f"Train_loss: {train_loss:.3f} | Val_acc: {val_acc:.3f}")

        # Plot the training-loss curve
        self.plot_loss()

    def predict(self):
        """Evaluate the model on the test loader and print the accuracy."""
        acc = self._evaluate(self.test_loader)
        print(f"----------------------------------------------------")
        print(f'Test acc: {acc:.3f}')

    def plot_loss(self):
        """Plot the per-epoch training loss recorded by ``train``."""
        plt.plot(self.train_loss_history)
        plt.title('Loss History')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['train'], loc='upper left')
        plt.show()

In [None]:
# Fine-tune on the VOC loaders, then report accuracy on the test split.
mrcnn_trainer = trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=EPOCHS,
    optimizer=optimizer,
)
mrcnn_trainer.train()
mrcnn_trainer.predict()

### train - Semantic Segmentation

In [None]:
# The ADE20K datasets are only defined in cells that are currently commented
# out, so referencing them unconditionally raises NameError on "Run All".
# Build the loaders only when all three datasets exist; otherwise leave the
# loader names defined as None so the notebook keeps running.
if all(name in globals() for name in ('ade_trainset', 'ade_valset', 'ade_testset')):
    train_loader2 = DataLoader(ade_trainset, batch_size=BATCH_SIZE, shuffle = True, collate_fn=collate_fn)
    val_loader2 = DataLoader(ade_valset, batch_size=BATCH_SIZE, shuffle = False, collate_fn=collate_fn)
    test_loader2 = DataLoader(ade_testset, batch_size=BATCH_SIZE, shuffle = False, collate_fn=collate_fn)
else:
    train_loader2 = val_loader2 = test_loader2 = None
    print('ADE20K datasets are not defined; skipping the segmentation data loaders.')

In [None]:
# mrcnn_trainer2 = trainer(model, train_loader2, val_loader2, test_loader2, EPOCHS, optimizer)
# mrcnn_trainer2.train()
# mrcnn_trainer2.predict()