## Prepare Data for ViT-based estimation method

This notebook uses code from `ViT-pytorch/`.

In [55]:
import argparse
import datetime
import json
import random
import time
from pathlib import Path
import os, sys
import numpy as np
import torch

from main import build_model_main
from util.slconfig import SLConfig
from datasets import build_dataset
from util.visualizer import COCOVisualizer
from util import box_ops
import pickle
import copy
import random
from util.utils import slprint, to_device
import util.misc as utils
from engine import evaluate
from torch.utils.data import DataLoader
from datasets import build_dataset, get_coco_api_from_dataset

In [2]:
# Dataset split handled by the following cells, and the directories holding
# that split's per-image data and box-annotation files.
split = "val"
base_path = "./data/5_scale_31/"
data_path = f"{base_path}{split}/data/"
annotation_path = f"{base_path}{split}/box_annotation/"

In [63]:
# Scratch check: swapping the first two axes and then flattening everything
# after the new leading axis turns a (4, 3, 2) tensor into a (3, 8) one.
d = torch.zeros(4, 3, 2)
d[1, 0, 0] = 1
print(d)
d = torch.transpose(d, 0, 1)
print(d)
d = d.flatten(start_dim=1)
d.shape

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[1., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]]])
tensor([[[0., 0.],
         [1., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])


torch.Size([3, 8])

In [3]:
# Locations of the per-image prediction JSON files, box-annotation JSON files
# and feature .npy files, for the train and val splits.
train_data_path = "./data/5_scale_31/train/data/"
train_annotation_path = "./data/5_scale_31/train/box_annotation/"
train_feature_path = f"{base_path}train/feature_data/"
test_data_path = "./data/5_scale_31/val/data/"
test_annotation_path = "./data/5_scale_31/val/box_annotation/"
test_feature_path = f"{base_path}val/feature_data/"

In [4]:
def read_one_image_results(path):
    """Load and return the JSON document stored in the file at ``path``."""
    with open(path, "r") as json_file:
        return json.load(json_file)

def write_one_results(path, json_data):
    """Serialize ``json_data`` as JSON into the file at ``path``, overwriting it."""
    with open(path, "w") as json_file:
        json.dump(json_data, fp=json_file)
        
def get_numpy_data(data_path, annotation_path, img_nums, topk=196):
    """Build box-level ViT inputs and targets from per-image JSON files.

    For every ``i`` in ``range(img_nums)`` this reads
    ``data_path + str(i) + ".json"`` (keys ``input.pred_logits`` and
    ``input.pred_boxes``) and ``annotation_path + str(i) + ".json"`` (keys
    ``selected_index`` and ``loss``).  Boxes and logits are concatenated along
    the last axis into one vector per prediction.

    Each entry of ``selected_index`` yields one sample of shape
    ``(m + 1, D)``: row 0 is the selected prediction's vector, the remaining
    ``m`` rows are the ``topk`` predictions with the highest maximum logit,
    ordered by column 1 and then column 0 of that vector (presumably the box
    centre y / x -- confirm against the detector's box format).

    Args:
        data_path: Directory prefix (with trailing separator) of the
            prediction JSON files.
        annotation_path: Directory prefix (with trailing separator) of the
            annotation JSON files.
        img_nums: Number of images to read, indexed from 0.
        topk: Number of highest-scoring predictions kept as context rows.

    Returns:
        ``(X, Y)`` where ``X`` stacks all samples along axis 0 and ``Y`` is
        the concatenation of every image's ``loss`` list as an ndarray.
        Either value is ``None`` when nothing was collected for it.

    Raises:
        ValueError: If a prediction file does not hold a batch of exactly one.
    """
    # Collect per-image chunks and concatenate once at the end; growing the
    # result with np.concatenate inside the loop copies everything each time.
    x_chunks = []
    y_chunks = []
    for img_idx in range(img_nums):
        results = read_one_image_results(data_path + str(img_idx) + ".json")
        pred_logits = np.array(results['input']['pred_logits'])
        pred_boxes = np.array(results['input']['pred_boxes'])
        pred_results = np.concatenate((pred_boxes, pred_logits), axis=2)
        if pred_results.shape[0] != 1:
            raise ValueError(
                f"expected a batch of one prediction set for image {img_idx}, "
                f"got {pred_results.shape[0]}"
            )
        per_query = pred_results[0]

        annotation_data = read_one_image_results(annotation_path + str(img_idx) + ".json")
        selected_index = np.asarray(annotation_data['selected_index'], dtype=np.intp)
        y_chunks.append(annotation_data['loss'])

        # Index the batch axis explicitly instead of squeeze(), which would
        # also drop the query axis when there is a single query.
        confidence = pred_logits.max(axis=2)[0]
        keep = np.argsort(-confidence)[:topk]
        context = per_query[keep]
        # lexsort uses the LAST key as the primary one: column 1, then column 0.
        context = context[np.lexsort((context[:, 0], context[:, 1]))]

        # Images without any selected prediction contribute no samples.
        if selected_index.size:
            own = per_query[selected_index][:, np.newaxis]
            tiled = np.broadcast_to(context, (selected_index.size,) + context.shape)
            x_chunks.append(np.concatenate((own, tiled), axis=1))

        if img_idx % 100 == 0:
            print(f"{img_idx} finished")

    X = np.concatenate(x_chunks, axis=0) if x_chunks else None
    Y = np.concatenate(y_chunks) if y_chunks else None
    return X, Y

In [5]:
def np_read(file):
    """Load and return the NumPy data stored in the file at path ``file``."""
    with open(file, "rb") as stream:
        return np.load(stream)
def np_write(data, file):
    """Save ``data`` in NumPy ``.npy`` format to the file at path ``file``."""
    with open(file, "wb") as stream:
        np.save(file=stream, arr=data)

In [6]:
def get_feature_data(data_path, annotation_path, feature_path, img_nums, topk=196):
    """Build feature-level ViT inputs and targets from per-image files.

    For every ``i`` in ``range(img_nums)`` this reads the 3-D feature array
    ``feature_path + str(i) + ".npy"``, the predictions
    ``data_path + str(i) + ".json"`` (keys ``input.pred_logits`` and
    ``input.pred_boxes``) and the annotations
    ``annotation_path + str(i) + ".json"`` (keys ``selected_index`` and
    ``loss``).

    The feature array of shape ``(S, Q, C)`` is rearranged so that each of the
    ``Q`` queries gets one vector of length ``S * C`` (the ``S`` slices laid
    side by side).  Each entry of ``selected_index`` yields one sample of
    shape ``(m + 1, S * C)``: row 0 is the selected query's vector, the other
    ``m`` rows are the vectors of the ``topk`` queries with the highest
    maximum logit, ordered by box column 1 and then box column 0 (presumably
    the box centre y / x -- confirm against the detector's box format).

    Args:
        data_path: Directory prefix (with trailing separator) of the
            prediction JSON files.
        annotation_path: Directory prefix (with trailing separator) of the
            annotation JSON files.
        feature_path: Directory prefix (with trailing separator) of the
            feature ``.npy`` files.
        img_nums: Number of images to read, indexed from 0.
        topk: Number of highest-scoring queries kept as context rows.

    Returns:
        ``(X, Y)`` where ``X`` (float64) stacks all samples along axis 0 and
        ``Y`` is the concatenation of every image's ``loss`` list as an
        ndarray.  Either value is ``None`` when nothing was collected for it.

    Raises:
        ValueError: If a prediction file does not hold a batch of exactly one.
    """
    # Collect per-image chunks and concatenate once at the end; growing the
    # result with np.concatenate inside the loop copies everything each time.
    x_chunks = []
    y_chunks = []
    for img_idx in range(img_nums):
        feature = np_read(feature_path + str(img_idx) + ".npy")
        n_slices, n_queries, n_channels = feature.shape
        # (S, Q, C) -> (Q, S * C): slice s of query q lands in columns
        # [s * C, (s + 1) * C).  float64 keeps the dtype callers got before.
        flat_feature = (
            feature.transpose(1, 0, 2)
            .reshape(n_queries, n_slices * n_channels)
            .astype(np.float64)
        )

        results = read_one_image_results(data_path + str(img_idx) + ".json")
        pred_logits = np.array(results['input']['pred_logits'])
        pred_boxes = np.array(results['input']['pred_boxes'])
        if pred_logits.shape[0] != 1 or pred_boxes.shape[0] != 1:
            raise ValueError(
                f"expected a batch of one prediction set for image {img_idx}"
            )

        annotation_data = read_one_image_results(annotation_path + str(img_idx) + ".json")
        selected_index = np.asarray(annotation_data['selected_index'], dtype=np.intp)
        y_chunks.append(annotation_data['loss'])

        # Index the batch axis explicitly instead of squeeze(), which would
        # also drop the query axis when there is a single query.
        confidence = pred_logits.max(axis=2)[0]
        keep = np.argsort(-confidence)[:topk]
        kept_boxes = pred_boxes[0, keep]
        # lexsort uses the LAST key as the primary one: column 1, then column 0.
        order = np.lexsort((kept_boxes[:, 0], kept_boxes[:, 1]))
        query_feature = flat_feature[keep][order]

        # Images without any selected query contribute no samples.
        if selected_index.size:
            own = flat_feature[selected_index][:, np.newaxis]
            tiled = np.broadcast_to(
                query_feature, (selected_index.size,) + query_feature.shape
            )
            x_chunks.append(np.concatenate((own, tiled), axis=1))

        if img_idx % 100 == 0:
            print(f"{img_idx} finished")

    X = np.concatenate(x_chunks, axis=0) if x_chunks else None
    Y = np.concatenate(y_chunks) if y_chunks else None
    return X, Y

# def prepare_feature_data(data_path, annotation_path, feature_path, img_nums, stored_path):
#     Y = None
#     count = 0
#     for img_idx in range(img_nums):
#         feature = np_read(feature_path + str(img_idx) + ".npy")
#         new_feature = np.zeros((1, feature.shape[1], feature.shape[2]*feature.shape[0]))
#         steps = feature.shape[2]
#         for i in range(feature.shape[0]):
#             new_feature[:,:,(steps*i):(steps*i+steps)] = feature[i]
#         results = read_one_image_results(data_path + str(img_idx) + ".json")
#         pred_logits = np.array(results['input']['pred_logits'])
#         pred_boxes = np.array(results['input']['pred_boxes'])
#         pred_results = np.concatenate((pred_boxes, pred_logits), axis=2)
#         annotation_data = read_one_image_results(annotation_path + str(img_idx) + ".json")
#         selected_index = annotation_data['selected_index']
#         loss = annotation_data['loss']
#         pred_logits_max = np.max(pred_logits, axis=2).squeeze()
#         sort_indexs = np.argsort(-pred_logits_max)
#         topk_indexs = sort_indexs[:196]
#         pred_results = pred_results[:,topk_indexs]
#         query_feature = new_feature[:,topk_indexs]
#         arrSortedIndex  = np.lexsort((pred_results[:,:,0], pred_results[:,:,1])).squeeze()
#         query_feature = query_feature[:,arrSortedIndex]
#         for i in selected_index:
#             one_X = np.concatenate((new_feature[:,i], query_feature.squeeze(axis=0)), axis=0)
#             np_write(one_X, stored_path + str(count) + ".npy")
#             count += 1
#         if Y is None:
#             Y = loss
#         else:
#             Y = np.concatenate((Y, loss))
#         if img_idx % 100 == 0:
#             print(f"{img_idx} finished")
#     np_write(Y, stored_path + "annotation.npy")
#     return

def prepare_feature_data(data_path, annotation_path, feature_path, img_nums, stored_path):
    """Write per-image context features and per-sample JSON files to disk.

    For each image index in ``range(img_nums)``:

    * the 3-D feature array is rearranged so every query has one vector made
      of its slices laid side by side;
    * the vectors of the 196 queries with the highest maximum logit, ordered
      by box column 1 then box column 0, are saved to
      ``stored_path + "feature" + str(img_idx) + ".npy"``;
    * for each entry of the annotation's ``selected_index`` a JSON file
      ``stored_path + str(count) + ".json"`` is written with that query's own
      vector (``self_feature``) and the image index (``feature_idx``);
      ``count`` runs over all images.

    All ``loss`` lists are concatenated and saved to
    ``stored_path + "annotation.npy"``.  Returns ``None``.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once that cell has been run.
    """
    Y = None
    count = 0
    for img_idx in range(img_nums):
        img_tag = str(img_idx)

        # Lay the slices of the feature array side by side per query.
        feature = np_read(feature_path + img_tag + ".npy")
        steps = feature.shape[2]
        new_feature = np.zeros((1, feature.shape[1], steps * feature.shape[0]))
        for slice_idx, one_slice in enumerate(feature):
            start = steps * slice_idx
            new_feature[:, :, start:start + steps] = one_slice

        results = read_one_image_results(data_path + img_tag + ".json")
        pred_logits = np.array(results['input']['pred_logits'])
        pred_boxes = np.array(results['input']['pred_boxes'])
        pred_results = np.concatenate((pred_boxes, pred_logits), axis=2)

        annotation_data = read_one_image_results(annotation_path + img_tag + ".json")
        selected_index = annotation_data['selected_index']
        loss = annotation_data['loss']

        # Keep the 196 most confident queries, then order them by box columns.
        best_logit = np.max(pred_logits, axis=2).squeeze()
        topk_indexs = np.argsort(-best_logit)[:196]
        pred_results = pred_results[:, topk_indexs]
        sort_order = np.lexsort((pred_results[:, :, 0], pred_results[:, :, 1])).squeeze()
        query_feature = new_feature[:, topk_indexs][:, sort_order].squeeze(axis=0)
        np_write(query_feature, stored_path + "feature" + img_tag + ".npy")

        for query_idx in selected_index:
            write_one_results(
                stored_path + str(count) + ".json",
                {"self_feature": new_feature[:, query_idx].tolist(), "feature_idx": img_idx},
            )
            count += 1

        Y = loss if Y is None else np.concatenate((Y, loss))
        if img_idx % 100 == 0:
            print(f"{img_idx} finished")
    np_write(Y, stored_path + "annotation.npy")
    return

## region-level box ViT data

In [None]:
# Build the box-level inputs/annotations from the first 5000 val images.
test_X, test_Y = get_numpy_data(
    data_path=test_data_path,
    annotation_path=test_annotation_path,
    img_nums=5000,
)

In [92]:
# Persist the val box-level inputs and annotations under <split>/pre_data/.
split = "val"
store_preprocess_inputs_path = f"{base_path}{split}/pre_data/{split}_box_level_ViT_inputs.npy"
np_write(test_X, store_preprocess_inputs_path)
store_preprocess_annotations_path = f"{base_path}{split}/pre_data/{split}_box_level_ViT_annotations.npy"
np_write(test_Y, store_preprocess_annotations_path)

In [94]:
# Display the shapes of the val inputs and annotations; their leading
# dimensions (number of samples) are expected to match.
test_X.shape, test_Y.shape

((49129, 197, 95), (49129,))

In [None]:
# Build the box-level inputs/annotations from the first 50000 train images.
train_X, train_Y = get_numpy_data(
    data_path=train_data_path,
    annotation_path=train_annotation_path,
    img_nums=50000,
)

In [None]:
# Persist the train box-level inputs and annotations under <split>/pre_data/.
split = "train"
store_preprocess_inputs_path = f"{base_path}{split}/pre_data/{split}_box_level_ViT_inputs.npy"
np_write(train_X, store_preprocess_inputs_path)
store_preprocess_annotations_path = f"{base_path}{split}/pre_data/{split}_box_level_ViT_annotations.npy"
np_write(train_Y, store_preprocess_annotations_path)

## Combine training data from multiple runs

In [7]:
# Configuration for merging several partial .npy dumps of one split.
split = "val"
basic_path = "./data/5_scale_31/" + split + "/pre_data/"
# Earlier (train, box-level) configuration, kept for reference:
# input_files_name = ["train_box_level_ViT_inputs.npy", "train_box_level_ViT_inputs_10000.npy", "train_box_level_ViT_inputs_20000.npy"]
# annotations_files_name = ["train_box_level_ViT_annotations.npy", "train_box_level_ViT_annotations_10000.npy", "train_box_level_ViT_annotations_20000.npy"]
# output_input_file = "train_box_level_ViT_inputs_all.npy"
# output_annotations_file = "train_box_level_ViT_annotations_all.npy"

# Chunk files are named <files_basic_name>{inputs,annotations}_<tag>.npy with
# <tag> taken from files_list.
files_basic_name = split + "_feature_box_level_ViT_"
files_list = [0, 2000, 3000, 4000, 5000]
# NOTE(review): the two lists below still carry "train_" names although
# split is "val", and the merging loop in the next cell does not read them.
input_files_name = [
    "train_box_level_ViT_inputs.npy",
    "train_box_level_ViT_inputs_10000.npy",
    "train_box_level_ViT_inputs_20000.npy",
]
annotations_files_name = [
    "train_box_level_ViT_annotations.npy",
    "train_box_level_ViT_annotations_10000.npy",
    "train_box_level_ViT_annotations_20000.npy",
]
output_input_file = files_basic_name + "inputs.npy"
output_annotations_file = files_basic_name + "annotations.npy"

In [4]:
# Concatenate the chunk files along axis 0 and write the merged arrays.
# NOTE(review): range(2) merges only the first two entries of files_list --
# confirm that the remaining tags are meant to be left out.
inputs_all, annotations_all = None, None
for idx in range(2):
    input_file = f"{basic_path}{files_basic_name}inputs_{files_list[idx]}.npy"
    annotations_file = f"{basic_path}{files_basic_name}annotations_{files_list[idx]}.npy"
    inputs = np_read(input_file)
    annotations = np_read(annotations_file)
    if inputs_all is not None:
        inputs_all = np.concatenate((inputs_all, inputs), axis=0)
        annotations_all = np.concatenate((annotations_all, annotations), axis=0)
    else:
        inputs_all, annotations_all = inputs, annotations
np_write(inputs_all, basic_path + output_input_file)
np_write(annotations_all, basic_path + output_annotations_file)

## Region-level feature ViT data

In [None]:
# Build the feature-level inputs/annotations from the first 5000 val images.
test_X, test_Y = get_feature_data(
    data_path=test_data_path,
    annotation_path=test_annotation_path,
    feature_path=test_feature_path,
    img_nums=5000,
)

In [None]:
# Persist the val feature-level inputs and annotations under <split>/pre_data/.
split = "val"
store_preprocess_inputs_path = f"{base_path}{split}/pre_data/{split}_feature_box_level_ViT_inputs.npy"
np_write(test_X, store_preprocess_inputs_path)
store_preprocess_annotations_path = f"{base_path}{split}/pre_data/{split}_feature_box_level_ViT_annotations.npy"
np_write(test_Y, store_preprocess_annotations_path)

## Region-level feature ViT data
Split the features into separate files so that the meta model can load them.

The annotations are still stored in a single file.

In [15]:
# Write the split-up feature files for the first 5 val images.
split = "val"
stored_file = f"{base_path}{split}/feature_pre_data/"
prepare_feature_data(
    data_path=test_data_path,
    annotation_path=test_annotation_path,
    feature_path=test_feature_path,
    img_nums=5,
    stored_path=stored_file,
)

0 finished


## Region-level feature ViT data with queries with the smallest IoU distance

In [None]:
def prepare_feature_data(data_path, annotation_path, feature_path, img_nums, stored_path):
    """Write, per selected query, the indices of its lowest-cost neighbours.

    For each image index in ``range(img_nums)`` the predictions and
    annotations are read from JSON, and ``hungarian_matching`` scores every
    query against every query listed in ``selected_index``.  For each selected
    query a file ``stored_path + str(count) + ".json"`` is written containing
    ``selected_idxs`` (the query's own index followed by the indices of the
    196 lowest-cost queries) and ``feature_idx`` (the image index); ``count``
    runs over all images.

    All ``loss`` lists are concatenated and saved to
    ``stored_path + "annotation.npy"``.  Returns ``None``.

    ``feature_path`` is accepted but not used by this function.
    """
    Y = None
    count = 0
    for img_idx in range(img_nums):
        results = read_one_image_results(data_path + str(img_idx) + ".json")
        pred_logits = np.squeeze(np.array(results['input']['pred_logits']))
        pred_boxes = np.squeeze(np.array(results['input']['pred_boxes']))

        annotation_data = read_one_image_results(annotation_path + str(img_idx) + ".json")
        selected_index = annotation_data['selected_index']
        loss = annotation_data['loss']

        # One cost row per selected query, one column per query.
        cost_matrix = hungarian_matching(
            pred_logits,
            pred_boxes,
            pred_logits[selected_index],
            pred_boxes[selected_index],
        )
        topk_idx = np.argsort(cost_matrix, axis=1)[:, :196]
        self_index = np.expand_dims(np.array(selected_index), axis=1)
        final_index = np.concatenate((self_index, topk_idx), axis=1)

        for index_row in final_index:
            write_one_results(
                f"{stored_path}{count}.json",
                {"selected_idxs": index_row.tolist(), "feature_idx": img_idx},
            )
            count += 1

        Y = loss if Y is None else np.concatenate((Y, loss))
        if img_idx % 100 == 0:
            print(f"{img_idx} finished")
    np_write(Y, stored_path + "annotation.npy")
    return

In [37]:
from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
from scipy.optimize import linear_sum_assignment
def hungarian_matching(out_logits, out_bbox, tgt_logits, tgt_bbox, cost_class = 2.0, cost_bbox = 5.0, cost_giou = 2.0, focal_alpha = 0.25, cost_threshold = 2):
    """Compute the weighted matching cost between targets and queries.

    Despite the name, no assignment is solved here: only the cost matrix is
    returned.  The cost is the weighted sum of a focal-style classification
    cost, the L1 distance between boxes and the negated generalized IoU.

    Params:
        out_logits: numpy array [num_queries, num_classes] with the
            classification logits of all queries.
        out_bbox: numpy array [num_queries, 4] with the predicted boxes
            (converted with ``box_cxcywh_to_xyxy`` for the GIoU term).
        tgt_logits: numpy array [num_targets, num_classes]; the target class
            of each row is its arg-max.
        tgt_bbox: numpy array [num_targets, 4] with the target boxes.
        cost_class: weight of the classification cost.
        cost_bbox: weight of the L1 box cost.
        cost_giou: weight of the GIoU cost.
        focal_alpha: alpha of the focal-style classification cost.
        cost_threshold: accepted for interface compatibility; not used.
    Returns:
        numpy array [num_targets, num_queries] with the matching cost of
        every (target, query) pair; lower means closer.
    """
    # Distinct local names for the cost terms: reusing the parameter names
    # would overwrite the weights and turn the final sum into squared costs.
    out_prob = torch.from_numpy(out_logits).sigmoid()  # [num_queries, num_classes]
    out_bbox = torch.from_numpy(out_bbox)  # [num_queries, 4]

    tgt_ids = torch.as_tensor(np.argmax(tgt_logits, axis=1))
    tgt_bbox = torch.from_numpy(tgt_bbox)

    # Compute the classification cost.
    alpha = focal_alpha
    gamma = 2.0
    neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
    pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
    class_term = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

    # Compute the L1 cost between boxes
    bbox_term = torch.cdist(out_bbox, tgt_bbox, p=1)

    # Compute the giou cost between boxes (negated: higher overlap, lower cost)
    giou_term = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

    # Final cost matrix: weighted sum, [num_queries, num_targets]
    C = cost_bbox * bbox_term + cost_class * class_term + cost_giou * giou_term
    # Transpose so that each row belongs to one target.
    return C.numpy().T

In [56]:
# Step-by-step run of the neighbour-index selection for train image 0.
data_path, annotation_path, feature_path = (
    train_data_path,
    train_annotation_path,
    train_feature_path,
)
img_idx = 0
results = read_one_image_results(f"{data_path}{img_idx}.json")
pred_logits = np.squeeze(np.array(results['input']['pred_logits']))
pred_boxes = np.squeeze(np.array(results['input']['pred_boxes']))
annotation_data = read_one_image_results(f"{annotation_path}{img_idx}.json")
selected_index = annotation_data['selected_index']
loss = annotation_data['loss']
# Score every query against each selected query.
tgt_logits = pred_logits[selected_index]
tgt_bbox = pred_boxes[selected_index]
cost_matrix = hungarian_matching(pred_logits, pred_boxes, tgt_logits, tgt_bbox)
# Own index first, then the indices of the 196 lowest-cost queries.
topk_idx = np.argsort(cost_matrix, axis=1)[:, :196]
self_index = np.expand_dims(np.array(selected_index), axis=1)
final_index = np.concatenate((self_index, topk_idx), axis=1)

In [57]:
# One row per selected query: its own index plus up to 196 neighbour indices.
final_index.shape

(11, 197)

In [54]:
# One row per selected query: its own index plus up to 196 neighbour indices.
final_index.shape

(11, 197)