## Test different sklearn models for estimating loss

The inputs and annotation data come from MLP_model_demo.ipynb.

In [16]:
import json
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
import sklearn
import random
import torch

## Active Testing

In [9]:
def LURE_weights_for_risk_estimator(weights, N):
    """Compute the per-sample LURE correction factors for a risk estimate.

    Args:
        weights: NumPy array with one entry per acquired sample, in
            acquisition order (in this file: the probabilities returned by
            ``acquire``).
        N: Size of the pool the samples were acquired from.

    Returns:
        An array of ``weights.size`` factors when fewer than ``N`` samples
        were acquired; otherwise the scalar ``1``.
    """
    acquired = weights.size
    if acquired >= N:
        # The whole pool (or more) was acquired: no re-weighting is applied.
        return 1

    # 1-based position of each sample in the acquisition sequence.
    step = np.arange(1, acquired + 1)
    pool_ratio = (N - acquired) / (N - step)
    importance_gap = 1 / ((N - step + 1) * weights) - 1
    return 1 + pool_ratio * importance_gap

def acquire(expected_loss_inputs, samples_num):
    """Sample ``samples_num`` distinct indices with probability ~ expected loss.

    Indices are drawn one at a time without replacement. Before every draw the
    remaining scores are normalised to a distribution, each probability is
    raised to at least ``0.2 / remaining_count`` (20 % of the uniform
    probability), and the result is normalised again.

    Args:
        expected_loss_inputs: NumPy array of per-item expected losses
            (treated as 1-D). It is copied, never modified. Negative values
            are handled by shifting all scores up by ``|min|``; NaN values are
            replaced with 0 (a warning is logged).
        samples_num: Number of indices to draw; must not exceed the number of
            items.

    Returns:
        A tuple ``(pick_sample_idxs, weights)``: the drawn indices (int array)
        and, for each draw, the probability the chosen item had at the moment
        it was drawn (float32 array), both in acquisition order.

    Raises:
        AssertionError: If ``samples_num`` is larger than the input size.
    """
    # Imported locally because the file's import cell does not import logging.
    import logging

    assert samples_num <= expected_loss_inputs.size
    expected_loss = np.copy(expected_loss_inputs)
    if not np.issubdtype(expected_loss.dtype, np.floating):
        # The in-place normalisation below requires a floating-point array.
        expected_loss = expected_loss.astype(float)

    # Log-lik can be negative.
    # Make all values positive. nanmin is used so that a NaN entry does not
    # turn the shift itself (and with it every score) into NaN.
    if (expected_loss < 0).any():
        expected_loss += np.abs(np.nanmin(expected_loss))

    if np.any(np.isnan(expected_loss)):
        logging.warning(
            'Found NaN values in expected loss, replacing with 0.')
        logging.info(f'{expected_loss}')
        expected_loss = np.nan_to_num(expected_loss, nan=0)

    pick_sample_idxs = np.zeros(samples_num, dtype=int)
    # Maps positions in the shrinking score array back to original indices.
    idx_array = np.arange(expected_loss.size)
    weights = np.zeros(samples_num, dtype=np.single)
    uniform_clip_val = 0.2
    for i in range(samples_num):
        total = expected_loss.sum()
        if total > 0:
            expected_loss /= total
        else:
            # All remaining scores are zero: fall back to a uniform
            # distribution instead of dividing by zero.
            expected_loss = np.full_like(expected_loss, 1 / expected_loss.size)
        # Clip all values below 20 percent of the uniform probability.
        expected_loss = np.maximum(
            uniform_clip_val * 1 / expected_loss.size, expected_loss)
        expected_loss /= expected_loss.sum()

        sample = np.random.multinomial(1, expected_loss)
        cur_idx = np.where(sample)[0][0]
        pick_sample_idxs[i] = idx_array[cur_idx]
        weights[i] = expected_loss[cur_idx]

        # Remove the drawn item so it cannot be selected again.
        selected_mask = np.ones(expected_loss.size, dtype=bool)
        selected_mask[cur_idx] = False
        expected_loss = expected_loss[selected_mask]
        idx_array = idx_array[selected_mask]
    return pick_sample_idxs, weights

def get_expected_loss(inputs, model):
    """Return the model's predictions for ``inputs``.

    Args:
        inputs: Data passed unchanged to ``model.predict``.
        model: Any object exposing a ``predict`` method.

    Returns:
        Whatever ``model.predict(inputs)`` returns.
    """
    return model.predict(inputs)

def run_one_approximate_risk_estimator(true_losses, expected_losses, seed, samples_num):
    """Estimate the mean loss from an actively acquired, re-weighted subset.

    Args:
        true_losses: NumPy array of the real per-item losses.
        expected_losses: NumPy array of predicted per-item losses that drives
            the acquisition.
        seed: Seed applied to NumPy's and Python's global RNGs.
        samples_num: Number of items to acquire.

    Returns:
        The mean of the acquired true losses after multiplying each one by
        its LURE weight.
    """
    # Note: torch's RNG is not seeded in this function.
    random.seed(seed)
    np.random.seed(seed)

    chosen, acquisition_probs = acquire(expected_losses, samples_num)
    lure = LURE_weights_for_risk_estimator(acquisition_probs, expected_losses.size)
    return (true_losses[chosen] * lure).mean()

def write_one_results(path, json_data):
    """Serialise ``json_data`` as JSON into the file at ``path``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(path, "w") as sink:
        json.dump(json_data, sink)

def read_one_results(path):
    """Load and return the JSON document stored in the file at ``path``."""
    with open(path, "r") as source:
        return json.load(source)

In [10]:
# Directory prefix that the result JSON files of this notebook are written under.
result_json_path = "./results/MPL_loss_compare/"
# Sample budgets evaluated for every estimator.
sample_size_set = [50, 100, 150, 200, 250, 500, 750, 1000, 1500, 2000, 3000]
# RNG seeds; each (sample size, seed) pair yields one run.
random_seed_set = [4519, 9524, 5901, 1028, 6382, 5383, 5095, 7635,  890,  608]
def active_testing(file_path, true_losses, expected_losses, active_test_type):
    """Run the approximate risk estimator over the size/seed grid and save it.

    One estimate is computed for every combination of the module-level
    ``sample_size_set`` (outer loop) and ``random_seed_set`` (inner loop).
    The records are keyed by their running index and written as JSON.

    Args:
        file_path: Destination of the JSON file (overwritten).
        true_losses: Real per-item losses.
        expected_losses: Predicted per-item losses used for acquisition.
        active_test_type: Label stored in every record.
    """
    grid = [(size, seed) for size in sample_size_set for seed in random_seed_set]
    records = {}
    for run_id, (size, seed) in enumerate(grid):
        estimate = run_one_approximate_risk_estimator(
            true_losses, expected_losses, seed, size)
        records[run_id] = {
            "active_test_type": active_test_type,
            "sample_size": size,
            "loss": estimate,
        }
    with open(file_path, "w") as sink:
        json.dump(records, sink)

## Load data

In [11]:
# Locations of the pre-processed arrays; the "val" split serves as test data.
model_data_path = "./data/5_scale_31/"
train_data_path = model_data_path + "train/pre_data/"
test_data_path = model_data_path + "val/pre_data/"
# NOTE: the handle is named `outfile` although every file here is read.
with open(train_data_path + "train_inputs.npy", 'rb') as outfile:
    train_X = np.load(outfile)
# Flatten each sample to a single feature vector (axis 0 is kept).
train_X = train_X.reshape((train_X.shape[0], -1))
with open(train_data_path + "train_annotations.npy", 'rb') as outfile:
    # NOTE: spelled `tran_Y` (not `train_Y`); later cells use this spelling.
    tran_Y = np.load(outfile)
with open(test_data_path + "val_inputs.npy", 'rb') as outfile:
    test_X = np.load(outfile)
# Same flattening as for the training inputs.
test_X = test_X.reshape((test_X.shape[0], -1))
with open(test_data_path + "val_annotations.npy", 'rb') as outfile:
    test_Y = np.load(outfile)

## Random Sample

In [14]:
def run_one_random_sample_risk_estimator(true_losses, seed, samples_num):
    """Estimate the mean loss from a uniformly random subset.

    Args:
        true_losses: NumPy array of per-item losses.
        seed: Seed applied to the torch, NumPy and Python global RNGs.
        samples_num: Number of items to average over.

    Returns:
        The mean of the first ``samples_num`` losses of a seeded random
        permutation of the items.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    chosen = np.random.permutation(true_losses.size)[:samples_num]
    return true_losses[chosen].mean()

In [21]:
# Baseline: uniform random sampling, one estimate per (sample size, seed) pair.
file_path = result_json_path + "random_sample_R50_31_10_runs.json"
json_object = {}
for sample_size in sample_size_set:
    for seed in random_seed_set:
        result = {"active_test_type": "random sample", "sample_size": sample_size}
        loss_risk = run_one_random_sample_risk_estimator(test_Y, seed, sample_size)
        result["loss"] = loss_risk
        # Records are keyed by their running index.
        json_object[len(json_object)] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)

## Whole data set risk estimation

In [22]:
def get_whole_data_set_risk_estimator(true_losses):
    """Return the mean of all losses, i.e. the risk over the full data set.

    Args:
        true_losses: Object with a ``mean()`` method (a NumPy array in this
            notebook).
    """
    full_risk = true_losses.mean()
    return full_risk

In [23]:
# Reference value: the risk computed from every test item (no sampling).
file_path = result_json_path + "None_R50_31.json"
result = {"active_test_type": "None", "sample_size": test_Y.shape[0]}
result["loss"] = get_whole_data_set_risk_estimator(test_Y)
# Single record stored under key 0, same layout as the multi-run files.
json_object = {}
json_object[0] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)

## ASE
The estimated loss comes from active_test_demo.ipynb.

In [13]:
# Load the previously stored loss estimates and use them to drive acquisition.
results = read_one_results("./results/loss_analysis/" + "image_based_ASE.json")
ase_expected_loss = np.array(results["estimated loss"])
true_losses = test_Y
file_path = result_json_path + "ASE_R50_31_10_runs.json"
active_testing(file_path, true_losses, ase_expected_loss, "ASE")

## Ensemble Gradient Boosting

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with absolute-error loss and a fixed random state.
gb_reg = GradientBoostingRegressor(random_state=0, loss='absolute_error')
# Only the first 1000 training rows are used for fitting.
gb_reg.fit(train_X[:1000], tran_Y[:1000])

GradientBoostingRegressor(loss='absolute_error', random_state=0)

In [6]:
# Spot check: prediction for the single test row at index 1.
gb_reg.predict(test_X[1:2])

array([0.16718003])

In [7]:
# Score of the fitted regressor on the full test split.
gb_reg.score(test_X, test_Y)

-0.10270405057616716

In [9]:
# The regressor's test predictions act as the expected losses for acquisition.
true_losses = test_Y
expected_losses = get_expected_loss(test_X, gb_reg)

In [11]:
# Run the size/seed grid with the gradient-boosting estimates and save it.
file_path = result_json_path + "EGB_R50_31_10_runs.json"
active_testing(file_path, true_losses, expected_losses, "EGB")

## AdaBoost Regressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html#sklearn.ensemble.AdaBoostRegressor

In [5]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost with 100 estimators and a fixed random state.
abr_reg = AdaBoostRegressor(random_state=0, n_estimators=100)
# Only the first 1000 training rows are used for fitting.
abr_reg.fit(train_X[:1000], tran_Y[:1000])

AdaBoostRegressor(n_estimators=100, random_state=0)

In [6]:
# Spot check: prediction for the single test row at index 1.
abr_reg.predict(test_X[1:2])

array([0.4134521])

In [7]:
# Score of the fitted regressor on the full test split.
abr_reg.score(test_X, test_Y)

-0.09421608247547097

In [8]:
# The regressor's test predictions act as the expected losses for acquisition.
true_losses = test_Y
expected_losses = get_expected_loss(test_X, abr_reg)

In [9]:
# Run the size/seed grid with the AdaBoost estimates and save it.
file_path = result_json_path + "ABR_R50_31_10_runs.json"
active_testing(file_path, true_losses, expected_losses, "ABR")

## Ridge Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

In [5]:
from sklearn.linear_model import RidgeCV
# Ridge regression that chooses its regularisation strength from these alphas.
redge_reg = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
# Fitted on the first 10000 training rows.
redge_reg.fit(train_X[:10000], tran_Y[:10000])

RidgeCV(alphas=array([0.001, 0.01 , 0.1  , 1.   ]))

In [6]:
# Score of the fitted regressor on the full test split.
redge_reg.score(test_X, test_Y)

-0.4727245391842507

In [14]:
# Use the ridge predictions as expected losses, run the grid and save it.
true_losses = test_Y
expected_losses = get_expected_loss(test_X, redge_reg)
file_path = result_json_path + "RedgeReg_R50_31_10_runs.json"
active_testing(file_path, true_losses, expected_losses, "RR")

In [8]:
# true_losses = test_Y
# expected_losses = get_expected_loss(test_X, redge_reg)
# loss_analysis_path = "./results/loss_analysis/" + "image_based_regde_regression.json"
# json_object = {"true loss": true_losses.tolist(), "estimated loss": expected_losses.tolist()}
# write_one_results(loss_analysis_path, json_object)

## Isotonic Regression

https://scikit-learn.org/stable/modules/generated/sklearn.isotonic.IsotonicRegression.html

In [9]:
from sklearn.isotonic import IsotonicRegression
# Ridge predictions on training rows 5000-9999; used as the isotonic fit input.
redge_reg_X = redge_reg.predict(train_X[5000:10000])

In [25]:
# Output range is bounded by the min/max target of the same training slice;
# inputs outside the fitted range are clipped instead of raising.
iso_reg = IsotonicRegression(y_min = tran_Y[5000:10000].min(), y_max = tran_Y[5000:10000].max(), out_of_bounds='clip')
# Maps ridge predictions to the true targets of training rows 5000-9999.
iso_reg.fit(redge_reg_X, tran_Y[5000:10000])

IsotonicRegression(out_of_bounds='clip', y_max=4.05604875087738,
                   y_min=8.709733810974285e-05)

In [26]:
# Ridge predictions on the test split; these are the isotonic model's inputs.
redge_reg_test_X = redge_reg.predict(test_X)

In [31]:
# Isotonic-calibrated ridge predictions serve as expected losses for the grid.
true_losses = test_Y
expected_losses = get_expected_loss(redge_reg_test_X, iso_reg)
file_path = result_json_path + "IsReg_R50_31_10_runs.json"
active_testing(file_path, true_losses, expected_losses, "IS")

## Random sampling at image level with a box-level label budget, using image-level loss

In [25]:
%%capture
# Project-local helpers (not part of this notebook).
from util.slconfig import SLConfig
from datasets import build_dataset
# Load the model configuration and override the dataset-related fields.
model_config_path = "config/DINO/DINO_5scale.py"
args = SLConfig.fromfile(model_config_path) 
args.dataset_file = 'coco'
args.coco_path = "../coco/" # the path of coco
args.fix_size = False

# Validation split; presumably the images behind test_X/test_Y - TODO confirm.
dataset_val = build_dataset(image_set="val", args=args) 

In [26]:
def dataset_label_num_lists(dataset):
    """Return, for every dataset item, the number of labels of its target.

    Args:
        dataset: Indexable, sized collection whose items are
            ``(input, target)`` pairs; ``target['labels']`` must expose
            ``shape``.

    Returns:
        A list with ``target['labels'].shape[0]`` for each item, in index
        order.
    """
    samples = (dataset[position] for position in range(len(dataset)))
    return [target['labels'].shape[0] for _, target in samples]

def run_one_random_sample_risk_estimator_for_image_based(true_losses, label_nums, seed, samples_num):
    """Estimate the mean loss from random items under a label-count budget.

    Items are visited in a seeded random order and added to the sample until
    the accumulated label count reaches ``samples_num``; the item that makes
    the count reach the budget is still included.

    Args:
        true_losses: Per-item losses, indexable by integer position.
        label_nums: Per-item label counts, aligned with ``true_losses``.
        seed: Seed applied to the torch, NumPy and Python global RNGs.
        samples_num: Label budget that ends the sampling.

    Returns:
        The mean of the sampled losses.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    visiting_order = np.random.permutation(len(true_losses))
    picked = []
    labels_seen = 0
    for item_idx in visiting_order:
        if labels_seen >= samples_num:
            break
        picked.append(true_losses[item_idx])
        labels_seen += label_nums[item_idx]
    return np.array(picked).mean()

In [27]:
# Label count of every validation item, in dataset index order.
label_nums = dataset_label_num_lists(dataset_val)

In [29]:
# Hard-coded; presumably the total number of box labels in the val set - TODO confirm.
box_labels_nums = 36335
sample_size_precentage = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
# NOTE: this rebinds the global sample_size_set that active_testing() reads,
# so active_testing() calls made after this cell use these label budgets.
sample_size_set = (np.array(sample_size_precentage) * box_labels_nums).astype(int).tolist()
file_path = result_json_path + "random_sample_image_level_loss_R50_31_10_runs.json"
json_object = {}
for sample_size in sample_size_set:
    for seed in random_seed_set:
        result = {"active_test_type": "random sample image level loss", "sample_size": sample_size}
        # Here sample_size is a label budget, not a number of images.
        loss_risk = run_one_random_sample_risk_estimator_for_image_based(test_Y, label_nums, seed, sample_size)
        result["loss"] = loss_risk
        # Records are keyed by their running index.
        json_object[len(json_object)] = result
with open(file_path, "w") as outfile:
    json.dump(json_object, outfile)