In [1]:
import argparse
import datetime
import json
import random
import time
from pathlib import Path
import os, sys
import numpy as np

import torch
from torch.utils.data import DataLoader, DistributedSampler, random_split, TensorDataset
from torch import nn

In [2]:
split = "train"
model_data_path = "./data/5_scale_31/"
data_path = model_data_path + split + "/data/"
summary_annotation_file = model_data_path + split + "/summary.json"

In [3]:
with open(data_path + "0.json", "r") as outfile:
    data = json.load(outfile)
print(data['input'].keys())
print(data['annotation'].keys())

dict_keys(['pred_logits', 'pred_boxes', 'aux_outputs', 'interm_outputs', 'interm_outputs_for_matching_pre', 'dn_meta'])
dict_keys(['loss', 'loss_bbox_dn', 'loss_giou_dn', 'loss_ce_dn', 'loss_ce', 'loss_bbox', 'loss_giou', 'loss_ce_0', 'loss_bbox_0', 'loss_giou_0', 'loss_bbox_dn_0', 'loss_giou_dn_0', 'loss_ce_dn_0', 'loss_ce_1', 'loss_bbox_1', 'loss_giou_1', 'loss_bbox_dn_1', 'loss_giou_dn_1', 'loss_ce_dn_1', 'loss_ce_2', 'loss_bbox_2', 'loss_giou_2', 'loss_bbox_dn_2', 'loss_giou_dn_2', 'loss_ce_dn_2', 'loss_ce_3', 'loss_bbox_3', 'loss_giou_3', 'loss_bbox_dn_3', 'loss_giou_dn_3', 'loss_ce_dn_3', 'loss_ce_4', 'loss_bbox_4', 'loss_giou_4', 'loss_bbox_dn_4', 'loss_giou_dn_4', 'loss_ce_dn_4', 'loss_ce_interm', 'loss_bbox_interm', 'loss_giou_interm', 'loss_bbox_dn_unscaled', 'loss_giou_dn_unscaled', 'loss_ce_dn_unscaled', 'loss_xy_dn_unscaled', 'loss_hw_dn_unscaled', 'cardinality_error_dn_unscaled', 'loss_ce_unscaled', 'class_error_unscaled', 'loss_bbox_unscaled', 'loss_giou_unscaled', 'loss

In [4]:
# def read_one_key_from_file(file, first_key, second_key, third_key=None):
#     with open(file, "r") as outfile:
#         data = json.load(outfile)
#     return data[first_key][second_key]

In [None]:
# files = os.listdir(data_path)
# max_file_num = 10000
# files_num = min(len(files), max_file_num)
# # assert len(files) == files_num
# inputs = []
# annotations = []
# for file_idx in range(files_num):
#     file_path = data_path + str(file_idx) + ".json"
#     with open(file_path, "r") as outfile:
#         data = json.load(outfile)
#         pred_logits = np.squeeze(np.array(data['input']['pred_logits']), axis=0)
#         pred_boxes = np.squeeze(np.array(data['input']['pred_boxes']), axis=0)
#         pred_input = np.concatenate((pred_logits, pred_boxes), axis=1)
#         inputs.append(pred_input.tolist())
#         loss_ce = data['annotation']['loss_ce']
#         loss_bbox = data['annotation']['loss_bbox']
#         loss_giou = data['annotation']['loss_giou']
#         annotations.append(loss_ce + loss_bbox + loss_giou)
#     if file_idx % 1000 == 0:
#         print(file_idx)

In [7]:
# store_preprocess_inputs_path = model_data_path + split + f"/pre_data/{split}_inputs.npy"
# with open(store_preprocess_inputs_path, "wb") as outfile:
#     np.save(outfile, np.array(inputs))
# store_preprocess_annotations_path = model_data_path + split + f"/pre_data/{split}_annotations.npy"
# with open(store_preprocess_annotations_path, "wb") as outfile:
#     np.save(outfile, np.array(annotations))

## MLP model

In [3]:
class MPLNet(nn.Module):
    def __init__(self, input_dims = 10000, output_dims = 1, dropout = 0.1):
        super().__init__()
        self.layer1=nn.Linear(input_dims, 10000)
        self.layer2=nn.Linear(10000, 1000)
        self.layer3=nn.Linear(1000, 100)
        self.layer4=nn.Linear(100, 10)
        self.layer5=nn.Linear(10, 1)
        self.norm1 = nn.LayerNorm(10000)
        self.norm2 = nn.LayerNorm(1000)
        self.norm3 = nn.LayerNorm(100)
        self.norm4 = nn.LayerNorm(10)
        self._init_parms(self.layer1)
        self._init_parms(self.layer2)
        self._init_parms(self.layer3)
        self._init_parms(self.layer4)
        self._init_parms(self.layer5)
        # self.dropout1 = nn.Dropout(dropout)
        # self.dropout2 = nn.Dropout(dropout)
        # self.dropout3 = nn.Dropout(dropout)
        self.input_dims = input_dims
    
    def _init_parms(self, module):
        module.weight.data.normal_(mean=0.0, std=1.0)
        
    def forward(self, x):
        x=x.view(-1, self.input_dims)
        x=nn.functional.relu(self.norm1(self.layer1(x)))
        x=nn.functional.relu(self.norm2(self.layer2(x)))
        x=nn.functional.relu(self.norm3(self.layer3(x)))
        x=nn.functional.relu(self.norm4(self.layer4(x)))
        x=self.layer5(x)
        return x

In [4]:
device_name = "cuda:1"
device = torch.device(device_name)
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
rate_learning = 1e-4
model = MPLNet(input_dims=85500)
loss_function = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=rate_learning)
model.to(device)

MPLNet(
  (layer1): Linear(in_features=85500, out_features=10000, bias=True)
  (layer2): Linear(in_features=10000, out_features=1000, bias=True)
  (layer3): Linear(in_features=1000, out_features=100, bias=True)
  (layer4): Linear(in_features=100, out_features=10, bias=True)
  (layer5): Linear(in_features=10, out_features=1, bias=True)
  (norm1): LayerNorm((10000,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((1000,), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
  (norm4): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
)

In [5]:
# load data
batch_size = 10
store_preprocess_inputs_path = model_data_path + split + f"/pre_data/{split}_inputs.npy"
store_preprocess_annotations_path = model_data_path + split + f"/pre_data/{split}_annotations.npy"
with open(store_preprocess_inputs_path, 'rb') as outfile:
    inputs = torch.from_numpy(np.load(outfile))
with open(store_preprocess_annotations_path, 'rb') as outfile:
    annotations = torch.from_numpy(np.load(outfile))
    
datasets = TensorDataset(inputs, annotations)
datasets_nums = inputs.shape[0]
train_nums = int(datasets_nums * 0.7)
train_datasets, val_datasets = torch.utils.data.random_split(datasets, [train_nums, datasets_nums-train_nums])
train_sampler = torch.utils.data.RandomSampler(train_datasets)
batch_sampler_train = torch.utils.data.BatchSampler(train_sampler, batch_size, drop_last=True)
train_dataloader = DataLoader(train_datasets, batch_sampler=batch_sampler_train)

val_sampler = torch.utils.data.SequentialSampler(val_datasets)
valid_dataloader = DataLoader(val_datasets, 1, sampler=val_sampler, drop_last=False)

In [6]:
for epoch in range(10):

    # Performing Training for each epoch
    training_loss = 0.
    model.train()

    # The training loop
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        input, label = batch
        input = input.to(device=device_name).type(torch.float)
        label = label.to(device=device_name).type(torch.float)[None, :]
        output = model(input)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        training_loss += loss.item()
        if step % 100 == 0:
            print("Train Epoch: {}, Step: {}, Loss {}".format(epoch, step, loss.item()))


    # Performing Validation for each epoch
    validation_loss = 0.
    model.eval()

    # The validation loop
    for batch in valid_dataloader:
        input, label = batch
        input = input.to(device=device_name).type(torch.float)
        label = label.to(device=device_name).type(torch.float)[None, :]
        output = model(input)
        loss = loss_function(output, label)
        validation_loss += loss.item()

    # Calculating the average training and validation loss over epoch
    training_loss_avg = training_loss/len(train_dataloader)
    validation_loss_avg = validation_loss/len(valid_dataloader)

    # Printing average training and average validation losses
    print("Epoch: {}".format(epoch))
    print("Training loss: {}".format(training_loss_avg))
    print("Validation loss: {}".format(validation_loss_avg))

  return F.mse_loss(input, target, reduction=self.reduction)


Train Epoch: 0, Step: 0, Loss 1.4079663753509521
Train Epoch: 0, Step: 100, Loss 0.14491362869739532
Train Epoch: 0, Step: 200, Loss 0.07064949721097946
Train Epoch: 0, Step: 300, Loss 0.1600571721792221
Train Epoch: 0, Step: 400, Loss 0.07113819569349289
Train Epoch: 0, Step: 500, Loss 0.06636536866426468
Train Epoch: 0, Step: 600, Loss 0.0470028817653656
Epoch: 0
Training loss: 0.13012654406151603
Validation loss: 0.1096319569219806
Train Epoch: 1, Step: 0, Loss 0.09794391691684723
Train Epoch: 1, Step: 100, Loss 0.07985158264636993
Train Epoch: 1, Step: 200, Loss 0.05286634340882301
Train Epoch: 1, Step: 300, Loss 0.0969143733382225


KeyboardInterrupt: 

In [7]:
model.eval()
result_example = np.zeros(100)
for i in range(100):
    input = inputs[i].to(device=device_name).type(torch.float)
    output = model(input)
    result_example[i] = output.detach().cpu().numpy()
print(result_example)

[0.46378481 0.50836468 0.43675882 0.47644717 0.4669866  0.49336869
 0.52880955 0.41137904 0.45204777 0.5399906  0.44983721 0.49561977
 0.51709634 0.54614073 0.47704571 0.49241424 0.49363971 0.45553952
 0.4987483  0.41566736 0.48889589 0.4868024  0.50882953 0.48026282
 0.40693206 0.50247908 0.49741501 0.46098775 0.44397908 0.41126758
 0.50637001 0.51956153 0.47695261 0.47220343 0.50901115 0.51671141
 0.45579845 0.46015316 0.54054099 0.54043818 0.55663884 0.53353608
 0.46967125 0.47010213 0.51379019 0.42288727 0.46069342 0.47276914
 0.48792249 0.48977292 0.45354086 0.51540637 0.53348166 0.47597975
 0.45855373 0.44176763 0.39752394 0.45308101 0.46009493 0.44475228
 0.46692824 0.50908798 0.56615192 0.49608469 0.44961447 0.52036077
 0.57642484 0.5131554  0.40966755 0.46586293 0.50519437 0.53614908
 0.45775819 0.45283502 0.48057812 0.43737203 0.5045436  0.45854819
 0.50428152 0.42545217 0.46880782 0.47415984 0.43131417 0.52466351
 0.44499087 0.44779795 0.49688888 0.46387064 0.4989056  0.5235

In [12]:
torch.save(model.state_dict(), "ckpts/5_scale_31_mlp.pickle")

## Active testing

In [8]:
def LURE_weights_for_risk_estimator(weights, N):
    M = weights.size
    if M < N:
        m = np.arange(1, M+1)
        v = (
            1
            + (N-M)/(N-m) * (
                    1 / ((N-m+1) * weights)
                    - 1
                    )
            )
    else:
        v = 1

    return v

def acquire(expected_loss_inputs, samples_num):
    assert samples_num <= expected_loss_inputs.size
    expected_loss = np.copy(expected_loss_inputs)
    # Log-lik can be negative.
    # Make all values positive.
    if (expected_loss < 0).sum() > 0:
        expected_loss += np.abs(expected_loss.min())
    
    if np.any(np.isnan(expected_loss)):
        logging.warning(
            'Found NaN values in expected loss, replacing with 0.')
        logging.info(f'{expected_loss}')
        expected_loss = np.nan_to_num(expected_loss, nan=0)
    pick_sample_idxs = np.zeros((samples_num), dtype = int)
    idx_array = np.arange(expected_loss.size)
    weights = np.zeros((samples_num), dtype = np.single)
    uniform_clip_val = 0.2
    for i in range(samples_num):
        expected_loss /= expected_loss.sum()
        # clip all values less than 10 percent of uniform propability
        expected_loss = np.maximum(uniform_clip_val * 1/expected_loss.size, expected_loss)
        expected_loss /= expected_loss.sum()
        sample = np.random.multinomial(1, expected_loss)
        cur_idx = np.where(sample)[0][0]
        pick_sample_idxs[i] = idx_array[cur_idx]
        weights[i] = expected_loss[cur_idx]
        selected_mask = np.ones((expected_loss.size), dtype=bool)
        selected_mask[cur_idx] = False
        expected_loss = expected_loss[selected_mask]
        idx_array = idx_array[selected_mask]
    return pick_sample_idxs, weights

def get_expected_loss_MLP(inputs):
    model.eval()
    datasets_nums = inputs.shape[0]
    expected_loss = np.zeros(datasets_nums)
    for i in range(datasets_nums):
        input = inputs[i].to(device=device_name).type(torch.float)
        output = model(input)
        expected_loss[i] = output.detach().cpu().numpy()
    return expected_loss

def run_one_approximate_risk_estimator(true_losses, expected_losses, seed, samples_num):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    pick_sample_idxs, weights = acquire(expected_losses, samples_num)
    risk_estimator_weights = LURE_weights_for_risk_estimator(weights, expected_losses.size)
    sampled_true_losses = true_losses[pick_sample_idxs]

    loss_risk = (sampled_true_losses * risk_estimator_weights).mean()
    return loss_risk

def get_true_loss(annotations):
    return annotations.numpy()

In [9]:
val_split = "val"
store_preprocess_inputs_path = model_data_path + val_split + f"/pre_data/{val_split}_inputs.npy"
store_preprocess_annotations_path = model_data_path + val_split + f"/pre_data/{val_split}_annotations.npy"
with open(store_preprocess_inputs_path, 'rb') as outfile:
    val_inputs = torch.from_numpy(np.load(outfile))
with open(store_preprocess_annotations_path, 'rb') as outfile:
    val_annotations = torch.from_numpy(np.load(outfile))
true_losses = get_true_loss(val_annotations)
expected_losses = get_expected_loss_MLP(val_inputs)

In [10]:
result_json_path = "./results/MPL_loss_compare/"
sample_size_set = [50, 100, 150, 200, 250, 500, 750, 1000, 1500, 2000, 3000]
random_seed_set = [4519, 9524, 5901, 1028, 6382, 5383, 5095, 7635,  890,  608]

In [11]:
file_path = result_json_path + "MLP_R50_31_10_runs.json"
json_object = {}
for sample_size in sample_size_set:
    for seed in random_seed_set:
        result = {"active_test_type": "MLP", "sample_size": sample_size}
        loss_risk = run_one_approximate_risk_estimator(true_losses, expected_losses, seed, sample_size)
        result["loss"] = loss_risk
        json_object[len(json_object)] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)

## Random Sample

In [40]:
def run_one_random_sample_risk_estimator(true_losses, seed, samples_num):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    perm = np.random.permutation(true_losses.size)
    pick_sample_idxs = perm[:samples_num]
    sampled_true_losses = true_losses[pick_sample_idxs]
    return sampled_true_losses.mean()

In [41]:
file_path = result_json_path + "random_sample_R50_31_10_runs.json"
json_object = {}
for sample_size in sample_size_set:
    for seed in random_seed_set:
        result = {"active_test_type": "random sample", "sample_size": sample_size}
        loss_risk = run_one_random_sample_risk_estimator(true_losses, seed, sample_size)
        result["loss"] = loss_risk
        json_object[len(json_object)] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)

In [47]:
def run_one_random_sample_risk_estimator_LURE(true_losses, seed, samples_num):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    perm = np.random.permutation(true_losses.size)
    pick_sample_idxs = perm[:samples_num]
    sampled_true_losses = true_losses[pick_sample_idxs]
    weights = np.ones(samples_num) * 1.0 / samples_num
    risk_estimator_weights = LURE_weights_for_risk_estimator(weights, true_losses.size)
    loss_risk = (sampled_true_losses * risk_estimator_weights).mean()
    return loss_risk

In [48]:
file_path = result_json_path + "random_sample_LURE_R50_31_10_runs.json"
json_object = {}
for sample_size in sample_size_set:
    for seed in random_seed_set:
        result = {"active_test_type": "random sample LURE", "sample_size": sample_size}
        loss_risk = run_one_random_sample_risk_estimator_LURE(true_losses, seed, sample_size)
        result["loss"] = loss_risk
        json_object[len(json_object)] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)

## Whole data set risk estimation

In [42]:
def get_whole_data_set_risk_estimator(true_losses):
    return true_losses.mean()

In [44]:
file_path = result_json_path + "None_R50_31.json"
result = {"active_test_type": "None", "sample_size": inputs.shape[0]}
result["loss"] = get_whole_data_set_risk_estimator(true_losses)
json_object = {}
json_object[0] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)