In [1]:
import sys
sys.path.insert(0, '..')

In [2]:
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torchvision.transforms.functional as FT

from tqdm import tqdm
from PIL import Image
from torchvision.datasets import VOCDetection
from xml.etree.ElementTree import parse as ET_parse
from utils.crowd.synthetic_data import generate_random_conf_matrix, generate_dl_conf_matrix, generate_box_parameters, generate_synthetic_data, xyxy2xywh
from models.rpn_generator import RPNGenerator

In [3]:
class CustomVOCDetection(VOCDetection):
    """VOC detection dataset that parses every annotation file once, at construction.

    The parsed boxes are cached on ``self.gt`` (one list per image) and can be
    fetched per image with :meth:`get_gt`.
    """

    # Position in this list is the integer class id stored in the ground truth.
    classes = [
        'background',
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ]

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``VOCDetection``, then build the ground-truth cache."""
        super().__init__(*args, **kwargs)
        self.build_gt()

    def build_gt(self):
        """Fill ``self.gt`` with ``[xmin, ymin, xmax, ymax, class_id]`` rows per image.

        Every path in ``self.annotations`` is parsed; objects whose ``difficult``
        field is not ``'0'`` are skipped. All five values are converted with ``int``.
        """
        per_image_boxes = []
        for xml_path in self.annotations:
            xml_root = ET_parse(xml_path).getroot()
            objects = self.parse_voc_xml(xml_root)['annotation']['object']

            rows = []
            for obj in objects:
                if obj['difficult'] != '0':
                    continue
                corners = obj['bndbox']
                raw_row = [corners['xmin'], corners['ymin'],
                           corners['xmax'], corners['ymax'],
                           self.classes.index(obj['name'])]
                rows.append([int(value) for value in raw_row])
            per_image_boxes.append(rows)

        self.gt = per_image_boxes

    def get_gt(self, index: int):
        """Return the cached list of box rows for the image at ``index``."""
        return self.gt[index]

In [4]:
import torchvision.transforms.functional as FT
from torchvision.transforms import ToTensor, ToPILImage, Resize


class StandardTransform:
    """Turn an image and its boxes into tensors resized to a fixed square.

    Args:
        image_size: Side length (pixels) of the square output image.
        augment: Optional iterable of callables. Each is called as
            ``augment(image_tensor, box_tensor)`` and must return the
            (possibly modified) ``(image_tensor, box_tensor)`` pair.
    """

    def __init__(self, image_size=512, augment=None):
        self.image_size = image_size
        # ``None`` instead of a literal ``[]`` default so instances never share one list.
        self.augment = [] if augment is None else augment

    def __call__(self, img, bbox):
        """Apply the augmentations, resize the image and rescale the boxes.

        Args:
            img: Image object exposing ``width`` and ``height`` and accepted by
                ``FT.to_tensor``.
            bbox: Sequence of rows whose first four values are
                ``xmin, ymin, xmax, ymax`` in pixels of ``img``; any further
                columns are passed through unchanged.

        Returns:
            ``(image_tensor, box_tensor)`` where the image is
            ``image_size x image_size`` and the first four box columns are
            expressed in pixels of the resized image.
        """
        new_image = FT.to_tensor(img)
        new_bbox = torch.FloatTensor(bbox)
        if new_bbox.numel() == 0:
            # An image without boxes yields a 1-D empty tensor; give it the
            # [xmin, ymin, xmax, ymax, class_id] row layout used by
            # CustomVOCDetection so the column slicing below still works.
            new_bbox = new_bbox.reshape(0, 5)

        # Call each augmentation itself (not the list) with both image and boxes.
        for augment in self.augment:
            new_image, new_bbox = augment(new_image, new_bbox)

        # resize image and box
        dims = (self.image_size, self.image_size)
        new_image = FT.resize(new_image, dims)

        # Resize bounding boxes: normalise by the original image size ...
        # NOTE(review): this uses the size of ``img``, so augmentations are
        # assumed to keep the image dimensions unchanged.
        old_dims = torch.FloatTensor([img.width, img.height, img.width, img.height]).unsqueeze(0)
        new_bbox[:, :4] = new_bbox[:, :4] / old_dims  # percent coordinates

        # ... then scale up to the resized image.
        new_dims = torch.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
        new_bbox[:, :4] = new_bbox[:, :4] * new_dims

        return new_image, new_bbox
    
    
def download_voc_2007(root='../.data/'):
    """Build the VOC2007 ``trainval`` and ``test`` datasets, downloading them into ``root``.

    Returns:
        ``(trainval_dataset, test_dataset)`` as ``CustomVOCDetection`` instances.
        No separate ``val`` split is created.
    """
    trainval_set, test_set = (
        CustomVOCDetection(root=root, year='2007', image_set=split, download=True)
        for split in ('trainval', 'test')
    )
    return trainval_set, test_set
    

In [5]:
# Build the trainval and test datasets with the default root ('../.data/').
voc_train, voc_test = download_voc_2007()

Using downloaded and verified file: ../.data/VOCtrainval_06-Nov-2007.tar
Extracting ../.data/VOCtrainval_06-Nov-2007.tar to ../.data/
Using downloaded and verified file: ../.data/VOCtest_06-Nov-2007.tar
Extracting ../.data/VOCtest_06-Nov-2007.tar to ../.data/


In [6]:
# Sanity check: sizes of the two splits.
len(voc_train), len(voc_test)

(5011, 4952)

In [7]:
# COCO class names, with 'background' at index 0.
coco_classes = ['background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
# VOC classes in the same order as CustomVOCDetection.classes, but spelled with
# the COCO names above (e.g. 'aeroplane' -> 'airplane', 'sofa' -> 'couch') so
# that each one can be looked up in coco_classes.
voc_classes = ['background', 'airplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair',
               'cow', 'dining table', 'dog', 'horse',
               'motorcycle', 'person', 'potted plant',
               'sheep', 'couch', 'train', 'tv']

In [8]:
# For each VOC class (in VOC order), its index in coco_classes. Used below to
# select the VOC rows/columns out of the COCO confusion matrices.
coco2voc_classes = [coco_classes.index(c) for c in voc_classes]

In [9]:
def _voc_subset(matrix_path):
    """Load a saved confusion matrix, keep only the VOC rows and columns, and add 1 to every cell."""
    full_matrix = np.load(matrix_path)
    voc_rows = full_matrix[coco2voc_classes]
    return voc_rows[:, coco2voc_classes] + 1


# One matrix per classifier architecture; all three go through the same reduction.
alexnet_cm = _voc_subset('../outputs/classification-coco/alexnet_scratch/conf_matrix.npy')
vgg16_cm = _voc_subset('../outputs/classification-coco/vgg16_scratch/conf_matrix.npy')
resnet50_cm = _voc_subset('../outputs/classification-coco/resnet50_scratch/conf_matrix.npy')

In [10]:
# Identity collate_fn: a batch stays a plain list of dataset samples instead of
# being stacked; unshuffled so results line up with the dataset order.
dataloader = torch.utils.data.DataLoader(voc_train, shuffle=False, batch_size=16, collate_fn=lambda x: x)
# NOTE(review): the device is hard-coded to CUDA, so the cells that use it need a GPU.
device = torch.device('cuda')

In [11]:
# Collect RPN proposals for every image in `dataloader`, once per checkpoint.
resnet18_weight = '../outputs/rpn-coco/resnet18/weights/epoch_004.pt'
resnet50_weight = '../outputs/rpn-coco/resnet50/weights/epoch_004.pt'
resnet101_weight = '../outputs/rpn-coco/resnet101/weights/epoch_004.pt'
resnet18_proposals, resnet50_proposals, resnet101_proposals = [], [], []

# Each checkpoint path is paired with the list that receives its per-image arrays.
checkpoint_outputs = (
    (resnet18_weight, resnet18_proposals),
    (resnet50_weight, resnet50_proposals),
    (resnet101_weight, resnet101_proposals),
)

with torch.no_grad():
    for weight, results in checkpoint_outputs:
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        model = torch.load(weight)['model']
        model.rpn.nms_thresh = 0.9
        model.eval()
        model.to(device)
        for data in tqdm(dataloader, total=len(dataloader)):
            # The first element of each sample is the image.
            imgs = [FT.to_tensor(sample[0]).to(device) for sample in data]
            proposals, scores, _ = model(imgs)
            for proposal, score in zip(proposals, scores):
                box_array = proposal.cpu().numpy()
                # Give the scores a trailing axis so they become the last column.
                score_column = score.cpu().numpy()[..., np.newaxis]
                results.append(np.concatenate([box_array, score_column], axis=-1))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 314/314 [02:17<00:00,  2.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 314/314 [04:33<00:00,  1.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 314/314 [06:16<00:00,  1.20s/it]


In [12]:
# Shape of the first image's array: the proposal columns followed by one score column.
resnet50_proposals[0].shape

(1000, 5)

# Increasing K

In [13]:
# Dataset config template filled with str.format. The three positional
# placeholders are, in order: the dataset directory name under
# ./data/voc2007_abls, the number of annotators, and the earl_ann_weights value.
# clean_train/val/test always point at the voc2007_ann1_k directory.
yaml_template = """train: ./data/voc2007_abls/{}/noisy_train
clean_train: ./data/voc2007_abls/voc2007_ann1_k/clean_train
val: ./data/voc2007_abls/voc2007_ann1_k/test
test: ./data/voc2007_abls/voc2007_ann1_k/test
image_dir: ./.data/VOCdevkit/VOC2007/JPEGImages

# number of classes
nc: 20

# number of classes by annotator
nc_ann: 20

# number of annotators
n_annotator: {}

earl_ann_weights: {}  # all weights are same

# class names
names: ['aeroplane', 'bicycle', 'bird', 'boat',
        'bottle', 'bus', 'car', 'cat', 'chair',
        'cow', 'diningtable', 'dog', 'horse',
        'motorbike', 'person', 'pottedplant',
        'sheep', 'sofa', 'train', 'tvmonitor']

# augmentations settings
hsv_h: 0.015  # image HSV-Hue augmentation (fraction)
hsv_s: 0.7  # image HSV-Saturation augmentation (fraction)
hsv_v: 0.4  # image HSV-Value augmentation (fraction)
rotate: 10  # image rotation (+/- deg)
translate: 0.1  # image translation (+/- fraction)
scale: 0.1  # image scale (+/- gain)
shear: 0  # image shear (+/- deg)
perspective: 0.0  # image perspective (+/- fraction), range 0-0.001
flipud: 0.0  # image flip up-down (probability)
fliplr: 0.5  # image flip left-right (probability)"""

In [14]:
# create synthetic train dataset

import os
import pandas as pd

def crowd_labels2df(crowd_labels, dataset):
    """Flatten per-image, per-annotator boxes into one long DataFrame.

    Args:
        crowd_labels: One entry per image (aligned with ``dataset.images``);
            each entry is a sequence indexed by annotator id whose items are
            that annotator's boxes, five values each
            (``x1, y1, x2, y2, class_id``).
        dataset: Object exposing ``images``, the image file paths.

    Returns:
        DataFrame with columns ``img_path`` (file basename), ``x1``, ``y1``,
        ``x2``, ``y2``, ``class_id`` (input value minus 1) and
        ``annotator_id``. Box coordinates are stored unchanged.
    """
    rows = [
        [os.path.basename(image_path), *box, annotator_id]
        for image_labels, image_path in zip(crowd_labels, dataset.images)
        for annotator_id, boxes in enumerate(image_labels)
        for box in boxes
    ]
    column_names = ['img_path', 'x1', 'y1', 'x2', 'y2', 'class_id', 'annotator_id']
    frame = pd.DataFrame(rows, columns=column_names)
    # Shift class ids down by one; the coordinates are deliberately left as-is.
    frame['class_id'] = frame['class_id'] - 1
    return frame

def crowd_labels_df2txt(df, path='../data/voc2007'):
    """Write one header-less CSV file per image into ``path``.

    Rows are grouped by ``img_path``; each group is saved as
    ``<img_path>.txt`` with the ``img_path`` column removed. The directory is
    created when missing.
    """
    os.makedirs(path, exist_ok=True)
    for image_name, image_rows in df.groupby('img_path'):
        target_file = os.path.join(path, image_name + '.txt')
        labels_only = image_rows.drop(columns='img_path')
        labels_only.to_csv(target_file, index=False, header=False)


In [15]:
def clean2df(dataset):
    """Flatten a dataset's ground-truth boxes into one DataFrame.

    Args:
        dataset: Object exposing ``gt`` (per-image lists of
            ``[x1, y1, x2, y2, class_id]`` rows) and ``images`` (file paths),
            aligned by position.

    Returns:
        DataFrame with columns ``img_path`` (file basename), ``x1``, ``y1``,
        ``x2``, ``y2`` and ``class_id``; every numeric column is the input
        value minus 1.
    """
    rows = [
        [os.path.basename(image_path), *box]
        for boxes, image_path in zip(dataset.gt, dataset.images)
        for box in boxes
    ]
    frame = pd.DataFrame(rows, columns=['img_path', 'x1', 'y1', 'x2', 'y2', 'class_id'])
    # Class ids and all four coordinates are shifted down by one.
    for column in ('class_id', 'x1', 'y1', 'x2', 'y2'):
        frame[column] = frame[column] - 1
    return frame


In [16]:
import json

def df_to_cocojson(df, path='../data/voc2007'):
    """Write the boxes in ``df`` as a JSON list of COCO-style annotation records.

    Args:
        df: DataFrame whose columns are, in order, ``img_path``, ``x1``,
            ``y1``, ``x2``, ``y2``, ``class_id``. ``img_path`` must start with
            an integer followed by a dot (e.g. ``000012.jpg``); that integer
            becomes ``image_id``.
        path: Path of the JSON file to write.

    Each record holds ``image_id``, ``category_id``, ``bbox`` (the four values
    returned by ``xyxy2xywh``, rounded to 3 decimals), ``iscrowd`` (always 0),
    a running ``id`` starting at 0, and ``area`` (width * height).
    """
    annotations = []
    for _, group in df.groupby('img_path'):
        # Row layout: [img_path, x1, y1, x2, y2, class_id]; columns 1-4 are
        # overwritten in place, so afterwards columns 3 and 4 hold width and height.
        box_list = group.to_numpy()
        box_list[:, 1:5] = xyxy2xywh(box_list[:, 1:5])
        for box in box_list:
            width, height = box[3], box[4]
            annotations.append({
                'image_id': int(box[0].split('.')[0]),
                # Plain int so json can serialise it whatever the column dtype was.
                'category_id': int(box[5]),
                'bbox': [round(x, 3) for x in box[1:5]],
                'iscrowd': 0,
                'id': len(annotations),
                # Previously box[2] * box[3], i.e. y * width.
                'area': width * height,
            })
    with open(path, 'w') as f:
        json.dump(annotations, f)

In [17]:
# Sweep over the number of synthetic annotators k. For every k this writes the
# noisy training labels, the confusion matrices and a dataset YAML file.
for k in [1, 2, 5, 10, 50, 100, 500, 1000]:
    # Annotator confusion matrices are generated from the VGG16 matrix.
    conf_matrix = generate_dl_conf_matrix(k, vgg16_cm)
    # Every annotator id maps to the same ResNet-50 proposal list.
    rpn_proposals = {i: resnet50_proposals for i in range(k)}
    crowd_labels, conf_matrix, box_params = generate_synthetic_data(voc_train, n_annotator=k, conf_matrix=conf_matrix, fixed_box=False,
                                                                    rpn_proposals=rpn_proposals)
    df = crowd_labels2df(crowd_labels, voc_train)
    save_path = f'../data/voc2007_abls/voc2007_ann{k}_k'
    os.makedirs(save_path, exist_ok=True)
    # Saves the conf_matrix returned by generate_synthetic_data, not the one passed in.
    np.save(os.path.join(save_path, 'conf_matrix.npy'), conf_matrix)
    crowd_labels_df2txt(df, path=os.path.join(save_path, 'noisy_train'))
    # NOTE(review): the YAML file name uses 'voc_2007_' while the data
    # directory uses 'voc2007_' - confirm this is intended.
    with open(f'../data/voc2007_abls/voc_2007_ann{k}_k.yaml', 'w') as f:
        f.write(yaml_template.format(f'voc2007_ann{k}_k', k, '[]'))
    if k == 1:  # only save same df once to save disk space
        # yaml_template points clean_train/val/test at this k == 1 directory.
        train_clean_df = clean2df(voc_train)
        crowd_labels_df2txt(train_clean_df, path=os.path.join(save_path, 'clean_train'))
        test_df = clean2df(voc_test)
        crowd_labels_df2txt(test_df, path=os.path.join(save_path, 'test'))
        df_to_cocojson(test_df, os.path.join(save_path, 'instances_test.json'))

# Decreasing Reliability

In [None]:
# Sweep over the noisy-annotator ratio nr with a fixed pool of k annotators.
# The first n_expert annotator ids use the ResNet-50 confusion matrix and the
# ResNet-101 proposals; the remaining n_noise ids use AlexNet and ResNet-18.
k = 25
seed = 1234
for nr in [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]:
    # round() rather than int(): (1.0 - 0.8) * 25 evaluates to
    # 4.999999999999999, which int() truncated to 4 experts instead of 5.
    n_expert = round((1.0 - nr) * k)
    n_noise = k - n_expert
    print(n_expert, n_noise, nr)
    # NOTE(review): at nr == 1.0 / nr == 0.0 one of these is called with a
    # count of 0 - confirm generate_dl_conf_matrix handles that.
    exp_conf_matrix = generate_dl_conf_matrix(n_expert, resnet50_cm, seed=seed)
    noise_conf_matrix = generate_dl_conf_matrix(n_noise, alexnet_cm, seed=seed+1)
    conf_matrix = np.concatenate((exp_conf_matrix, noise_conf_matrix), axis=0)
    rpn_proposals = {}
    for i in range(n_expert):
        rpn_proposals[i] = resnet101_proposals
    for i in range(n_expert, k):
        rpn_proposals[i] = resnet18_proposals
    # One proposal source per annotator confusion matrix.
    assert len(rpn_proposals) == conf_matrix.shape[0]
    crowd_labels, conf_matrix, box_params = generate_synthetic_data(voc_train, n_annotator=k, conf_matrix=conf_matrix, fixed_box=False,
                                                                    rpn_proposals=rpn_proposals)
    df = crowd_labels2df(crowd_labels, voc_train)
    save_path = f'../data/voc2007_abls/voc2007_ann{k}_nr{nr:.1f}'
    os.makedirs(save_path, exist_ok=True)
    np.save(os.path.join(save_path, 'conf_matrix.npy'), conf_matrix)
    crowd_labels_df2txt(df, path=os.path.join(save_path, 'noisy_train'))
    # NOTE(review): 0.7364 and 0.3465 are unexplained constants paired with the
    # expert / total annotator counts - confirm their origin.
    earl_weights = [[n_expert, 0.7364], [k, 0.3465]]
    with open(f'../data/voc2007_abls/voc_2007_ann{k}_nr{nr:.1f}.yaml', 'w') as f:
        f.write(yaml_template.format(f'voc2007_ann{k}_nr{nr:.1f}', k, str(earl_weights)))