In [293]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import *

# Presumably the directory holding the real regression datasets (TODO confirm
# against utils). No visible cell reads this value as a path: it is reassigned
# to an [n, d] list in the data-loading cell below.
dir_data = '../data/regr'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Optimization-based approach
Let R denote (a parameterization of) our estimate for the region. The idea is to maximize $$\textrm{vol}(R) - \lambda \sum_{i=1}^n f_i(R),$$ where $f_i(R)$ are functions which penalize R for including rejected points and potentially reward R for including non-rejected points.

Note: We may want to include a normalized volume in the objective. When the region is small, the gradient of the volume will be close to vanishing, but when the region is large, the gradient of the volume will be huge.

## Hard-thresholding method
We first hard-threshold each training point based on a residual cutoff on the core fit, giving a binary label $r_i \in \{0,1\}$ which is 1 if the i-th point is rejected and 0 if it is not. We then define $$f_i(R) = r_i \exp(-c_1 d(x_i, R)) - (1-r_i) \exp(-c_2 d(x_i, R)).$$
Here $c_1$ and $c_2$ are constants which control how strongly a rejected point is penalized (resp. a non-rejected point is rewarded) for being close to the approximate region. Note that for this particular objective, once a point is included in R, it stops contributing to the gradient.

# Load data
Here we load the full dataset, determine the train/test split, and also find the "core" group. We also preprocess to add a bias term to the features and center the features so they have mean 0. Lastly, we find the bounding box for the data, which will be used as an "outer bound" for each method.

The names of the options and their (n, d) values are:

- Dutch_drinking_inh (12121, 16)
- Dutch_drinking_wm (12131, 16)
- Dutch_drinking_sha (12098, 16)
- Brazil_health_heart (7728, 6)
- Brazil_health_stroke (9675, 6)
- Korea_grip (1022, 11)
- China_glucose_women2 (4568, 11)
- China_glucose_men2 (4360, 11)
- Spain_Hair (529, 5)
- China_HIV (2410, 27)

We also include a 'Synthetic' option. If this is selected, then the dir_data argument should be replaced with [n, d] specifying the number of data points to be generated and the desired dimensionality.

In [294]:
# Which dataset to use. For 'Synthetic', dir_data is the pair [n, d]
# (number of points, feature dimension).
dataset = 'Synthetic'
dir_data = [10000, 4]

# dataset = 'Brazil_health_stroke'

# Seed NumPy's global RNG so the synthetic draw and the train/test split are
# identical on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

if dataset == 'Synthetic':
    n, d = dir_data
    w    = np.ones(d)   # true regression weights
    bias = 10.          # true intercept
    std  = 0.3          # noise scale applied to Y below

    # Ground-truth region: the centred box [-3^(-1/d), 3^(-1/d)]^d, which
    # occupies 1/3 of the volume of the sampling cube [-1, 1]^d.
    R = np.ones((d, 2)) * (3 ** (-1/d))
    R[:, 0] *= -1

    # Features are uniform on [-1, 1]^d. Every response starts as pure noise,
    # and only points inside R receive the linear signal.
    X = 2 * np.random.rand(n, d) - 1
    Y = std * np.random.randn(n)
    for i in range(n):
        if in_box(X[i], R):
            Y[i] += np.dot(X[i], w) + bias

    # Core sample: n // 10 points uniform on the smaller centred box with
    # half-width 6^(-1/d), which lies inside R, all following the linear model.
    # NOTE(review): the core noise is unit-variance, whereas Y above uses
    # `std * randn`. Confirm whether `std *` was intended here as well.
    X_core = (6 ** (-1/d)) * (2 * np.random.rand(n // 10, d) - 1)
    Y_core = X_core @ w + bias + np.random.randn(n // 10)
    names_covariates = [X_core, Y_core, R]

    # Outer bounding box for every method: the sampling cube [-1, 1]^d.
    B = np.ones((d, 2))
    B[:, 0] *= -1

else:
    X, Y, names_covariates = load_regr_data(dataset)

    n = len(Y)
    d = len(X[0])

    X -= np.mean(X, axis = 0) # We're cheating a little bit here because we also center the test data; easy to fix
    # Outer bounding box: per-feature [min, max] of the centred data.
    B = np.column_stack([np.min(X, axis = 0), np.max(X, axis = 0)])

# Append a constant column so the intercept is fitted as the last coefficient.
X = np.concatenate([X, np.ones((n, 1))], axis = 1)

# Hold out 10% of the rows for testing; the remaining indices (in ascending
# order) form the training set. setdiff1d avoids the quadratic
# `i not in test_ind` scan over a NumPy array.
test_ind = np.random.choice(n, size = n // 10, replace = False)
train_ind = np.setdiff1d(np.arange(n), test_ind)

X_test = X[test_ind].copy()
Y_test = Y[test_ind].copy()

X_train = X[train_ind].copy()
Y_train = Y[train_ind].copy()

assert len(Y_test) + len(Y_train) == n

if dataset == 'Synthetic':
    # The synthetic branch stashed its core sample and true region in
    # names_covariates; the core features still need the bias column.
    X_core, Y_core, true_R = names_covariates
    X_core = np.concatenate([X_core, np.ones((len(X_core), 1))], axis = 1)
else:
    X_core, Y_core = find_core(X_train, Y_train)

# Fit the model on the core group only.
beta, min_eig, s_hat = core_fit(X_core, Y_core)
n_core = len(Y_core)

# Compute point inclusion labels
Using the core fit, we determine the inclusion/exclusion labels for each point. These can be 0/1 labels for hard-cutoff methods, or soft labels or log p-values (the latter two have not been implemented yet).

In [295]:
# Threshold parameter handed to both labelling routines (presumably the
# significance level of the residual cutoff; confirm in utils).
alpha = 0.05

# Both labellers receive exactly the same inputs: the data, the threshold,
# and the statistics returned by the core fit.
# NOTE(review): X and Y here contain every row, including the held-out test
# rows. Confirm that labelling the test points is intended.
label_inputs = (X, Y, alpha, s_hat, min_eig, n_core, beta)

hg_labels = hard_grow_labels(*label_inputs)
ho_labels = hard_opt_labels(*label_inputs)

# Compute good region
Using the labels defined above, approximate the "good" region.

In [296]:
# Settings for the growing-box method (a single placeholder entry).
hg_args = [None]

# Initial box for the optimisation method: [-0.1, 0.1] along each of the d
# features, stored as a (d, 2) tensor of [lower, upper] rows.
init_R = torch.ones(d, 2) * torch.tensor([-0.1, 0.1])

# Hyperparameters packed, in this order, into ho_args below.
reg, iters, lr, c1, c2 = 10. / n, 100, 0.01, 1., 1.

ho_args = [init_R, reg, iters, lr, c1, c2]

# The region methods work on the raw features, so drop the trailing bias
# column that was appended to X.
features = X[:, :-1]

hg_R = hard_grow_region(features, hg_labels, B, hg_args)
ho_R = hard_opt_region(
    torch.tensor(features),
    torch.tensor(ho_labels),
    torch.tensor(B),
    ho_args,
)

tensor([[-0.1000,  0.1000],
        [-0.1000,  0.1000],
        [-0.1000,  0.1000],
        [-0.1000,  0.1000]], requires_grad=True)
tensor([[-0.1853,  0.1842],
        [-0.1863,  0.1839],
        [-0.1884,  0.1862],
        [-0.1872,  0.1855]], requires_grad=True)
tensor([[-0.2669,  0.2654],
        [-0.2693,  0.2646],
        [-0.2737,  0.2690],
        [-0.2715,  0.2679]], requires_grad=True)
tensor([[-0.3444,  0.3427],
        [-0.3482,  0.3411],
        [-0.3551,  0.3477],
        [-0.3518,  0.3465]], requires_grad=True)
tensor([[-0.4174,  0.4157],
        [-0.4228,  0.4130],
        [-0.4317,  0.4214],
        [-0.4272,  0.4205]], requires_grad=True)
tensor([[-0.4843,  0.4835],
        [-0.4923,  0.4785],
        [-0.5023,  0.4899],
        [-0.4970,  0.4890]], requires_grad=True)
tensor([[-0.5433,  0.5453],
        [-0.5554,  0.5365],
        [-0.5654,  0.5526],
        [-0.5591,  0.5511]], requires_grad=True)
tensor([[-0.5950,  0.5999],
        [-0.6104,  0.5869],
        [-0.6

# Test results
Check how good the region selected by each method is. For all datasets, we can look at the MAE of the model on the selected region. For the synthetic case, we know the "correct" region to select, and we can compute precision and recall for the region itself.

In [297]:
# Baseline: ordinary least squares on the whole training set, scored with the
# bounding box B passed as the region.
# lstsq replaces solving the normal equations (X^T X) b = X^T Y directly:
# it does not raise on a rank-deficient design matrix and avoids squaring
# the condition number.
baseline_beta = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]
base_MAE = test_MAE(X_test, Y_test, baseline_beta, B)

# Core-fit coefficients scored on the region chosen by each method.
hg_MAE = test_MAE(X_test, Y_test, beta, hg_R)
ho_MAE = test_MAE(X_test, Y_test, beta, ho_R)

# Fraction of test points that fall inside each selected region
# (the bias column is dropped before the box test).
hg_incl = np.sum(in_box(X_test[:, :-1], hg_R)) / len(X_test)
ho_incl = np.sum(in_box(X_test[:, :-1], ho_R)) / len(X_test)

if dataset == 'Synthetic':
    # Intersecting a box with itself is used here to obtain that box's volume.
    true_vol = box_intersection(true_R, true_R)

    hg_vol = box_intersection(hg_R, hg_R)
    hg_intersect_vol = box_intersection(hg_R, true_R)

    ho_vol = box_intersection(ho_R, ho_R)
    ho_intersect_vol = box_intersection(ho_R, true_R)

    # Region-level precision: share of the selected volume that is truly good.
    # Region-level recall: share of the true region that was selected.
    hg_prec = hg_intersect_vol / hg_vol
    hg_rec  = hg_intersect_vol / true_vol

    ho_prec = ho_intersect_vol / ho_vol
    ho_rec  = ho_intersect_vol / true_vol

print('Results')
print(f'Baseline MAE: {base_MAE}')
print('---------------------------------')
print('Hard thresholding + growing box:')
print(f'MAE: {hg_MAE}')
print(f'Incl. frac: {hg_incl}')
if dataset == 'Synthetic':
    print(f'Region precision: {hg_prec}')
    print(f'Region recall: {hg_rec}')
print('---------------------------------')
print('Hard thresholding + optimization:')
print(f'MAE: {ho_MAE}')
print(f'Incl. frac: {ho_incl}')
if dataset == 'Synthetic':
    print(f'Region precision: {ho_prec}')
    print(f'Region recall: {ho_rec}')

Results
Baseline MAE: 4.434643256347318
---------------------------------
Hard thresholding + growing box:
MAE: 0.23709609549318472
Incl. frac: 0.325
Region precision: 0.9972650786138773
Region recall: 1.0
---------------------------------
Hard thresholding + optimization:
MAE: 0.2370989354458773
Incl. frac: 0.261
Region precision: 1.0
Region recall: 0.809971432249787
