### Import necessary packages

In [None]:
%load_ext autoreload
%autoreload 2
import pyreadstat
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import scipy
import pandas as pd
import seaborn as sns
from scipy.stats import norm, bernoulli
import matplotlib.pyplot as plt
import os, sys
import matplotlib.patheffects as pe
from utils import ols, make_width_coverage_plot, make_budget_plot, get_data, transform_features
import warnings; warnings.simplefilter('ignore')
from tqdm import tqdm

### Import the ACS PUMS dataset

In [None]:
# Each ACS PUMS column is paired with the type code handed to
# `transform_features` later in the notebook ("q" / "c" -- presumably
# quantitative / categorical; confirm against utils.transform_features).
_feature_spec = [
    ('AGEP', 'q'),
    ('SCHL', 'q'),
    ('MAR', 'c'),
    ('DIS', 'c'),
    ('ESP', 'c'),
    ('CIT', 'c'),
    ('MIG', 'c'),
    ('MIL', 'c'),
    ('ANC1P', 'c'),
    ('NATIVITY', 'c'),
    ('DEAR', 'c'),
    ('DEYE', 'c'),
    ('DREM', 'c'),
    ('SEX', 'c'),
    ('RAC1P', 'c'),
    ('SOCP', 'c'),
    ('COW', 'c'),
]
features = [col_name for col_name, _kind in _feature_spec]
ft = np.array([kind for _col_name, kind in _feature_spec])

# Load the 2019 data with 'PINCP' as the outcome column.
income_features, income, employed = get_data(
    year=2019,
    features=features,
    outcome='PINCP',
)

### Problem setup

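Split the data into a labeled half, used to train the prediction models, and an unlabeled half, on which inference is performed. Compute the ground-truth value of the regression coefficient on the full dataset. Specify the range of budgets in fractional form $n_b/n$, where $n$ is the size of the unlabeled half.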

In [None]:
# Half of the observations go to the labeled (training) split.
n_all = len(income)
n_tr = n_all // 2

(
    income_features_labeled,
    income_features_unlabeled,
    income_labeled,
    Y,
) = train_test_split(income_features, income, train_size=n_tr)

# Outcomes as plain arrays: training labels and held-out labels.
income_labeled = income_labeled.to_numpy()
Y = Y.to_numpy()

# Regression design for the held-out split: columns are (age, sex).
X = np.column_stack([
    income_features_unlabeled['AGEP'].to_numpy(),
    income_features_unlabeled['SEX'].to_numpy(),
])

# Ground truth: first OLS coefficient (the one on age) fitted on the full dataset.
age = income_features['AGEP'].to_numpy()
sex = income_features['SEX'].to_numpy()
full_design = np.column_stack([age, sex])
theta_true = ols(full_design, income.to_numpy())[0]

In [None]:
# Label budgets as fractions n_b/n: 20 evenly spaced values from 0.001 to 0.03.
budgets = np.linspace(start=0.001, stop=0.03, num=20)

### Train XGBoost model

Train XGBoost model on labeled data. Additionally train auxiliary model for predicting the magnitude of prediction error. Compute model uncertainty for unlabeled instances.

In [None]:
# Build the encoder on the full feature table, then reuse it (`enc`) so both
# splits are transformed consistently.
income_features_enc, enc = transform_features(income_features, ft)
income_features_labeled = transform_features(income_features_labeled, ft, enc)[0]
income_features_unlabeled = transform_features(income_features_unlabeled, ft, enc)[0]

# Both boosters share the same hyper-parameters and number of boosting rounds.
xgb_params = {'eta': 0.3, 'max_depth': 7, 'objective': 'reg:absoluteerror'}
num_boost_round = 2000

# Outcome model: fitted on the labeled split, evaluated on the unlabeled split.
dtrain = xgb.DMatrix(income_features_labeled, label=income_labeled)
tree = xgb.train(dict(xgb_params), dtrain, num_boost_round)
Yhat = tree.predict(xgb.DMatrix(income_features_unlabeled))

# Error model: regress the outcome model's absolute in-sample error on the
# same features.
train_preds = tree.predict(xgb.DMatrix(income_features_labeled))
abs_train_errs = np.abs(income_labeled - train_preds)
dtrain = xgb.DMatrix(income_features_labeled, label=abs_train_errs)
tree_err = xgb.train(dict(xgb_params), dtrain, num_boost_round)

In [None]:
# Predicted error magnitudes on the unlabeled split, floored at zero
# (the error model's raw predictions are not constrained to be non-negative).
raw_err_preds = tree_err.predict(xgb.DMatrix(income_features_unlabeled))
predicted_errs = np.clip(raw_err_preds, 0, None)

# Inverse of the empirical second-moment matrix (1/n) X^T X.
n_unlabeled = X.shape[0]
Hessian_inv = np.linalg.inv(1/n_unlabeled * X.T @ X)

# Column 0 corresponds to the coefficient of interest (first column of X).
h = Hessian_inv[:,0]

# Per-observation uncertainty: |h^T x_i| scaled by the predicted error size.
leverage = np.abs(h.dot(X.T))
uncertainty = leverage * predicted_errs

### Main experiment

Build the dataframe ```df``` of experiment results, with one row per estimator for every trial and budget. The columns in the dataframe are:

- ```lb``` - interval lower bound

- ```ub``` - interval upper bound

- ```interval width``` - equal to ```ub``` - ```lb```

- ```coverage``` - 0/1 indicator of whether or not interval covered target

- ```estimator``` - one of ```classical```, ```uniform```, or ```active```

- ```$n_b$``` - budget size 

In [None]:
# Number of observations in the unlabeled split.
n = n_all - n_tr

# num_trials: repetitions per budget value.
# alpha:      error level; intervals use the normal quantile at 1 - alpha/2.
# tau:        weight of the uniform component mixed into the sampling probabilities.
num_trials, alpha, tau = 1000, 0.1, 0.001

In [None]:
def _sandwich_interval(design, labels, hessian_inv, n_obs, z_crit, target):
    """Normal-approximation interval for the first OLS coefficient.

    Fits OLS of `labels` on `design`, estimates the covariance of the
    estimator as hessian_inv @ Cov(gradients) @ hessian_inv, and builds a
    symmetric interval around the first coefficient.

    Args:
        design: 2-D array with one row per observation.
        labels: 1-D array of (possibly inverse-probability-weighted) outcomes.
        hessian_inv: inverse of the empirical second-moment matrix of `design`.
        n_obs: number of observations used to scale the standard error.
        z_crit: normal critical value multiplying the standard error.
        target: true parameter value used for the coverage indicator.

    Returns:
        Tuple (lb, ub, width, covered) where `covered` is truthy iff
        lb <= target <= ub.
    """
    theta_hat = ols(design, labels)
    # Per-observation squared-loss gradients (x_i^T theta - y_i) * x_i for all
    # rows at once; this replaces a Python loop over every observation.
    # np.ravel guards against `ols` returning a column vector.
    residuals = design @ np.ravel(theta_hat) - labels
    grads = residuals[:, None] * design
    V = np.cov(grads.T)
    Sigma = hessian_inv @ V @ hessian_inv
    half_width = z_crit * (np.sqrt(Sigma[0, 0]) / np.sqrt(n_obs))
    lb = theta_hat[0] - half_width
    ub = theta_hat[0] + half_width
    covered = (target >= lb) * (target <= ub)
    return lb, ub, 2 * half_width, covered


results = []
columns = ["lb", "ub", "interval width", "coverage", "estimator", "$n_b$"]
# Scratch frame holding the three per-trial rows (active, uniform, classical);
# a copy is appended to `results` after every trial.
temp_df = pd.DataFrame(np.zeros((3, len(columns))), columns=columns)

# The critical value depends only on alpha, so compute it once.
z_crit = norm.ppf(1 - alpha/2)

for budget in tqdm(budgets):
    n_b = int(budget * n)

    # Active sampling probabilities: proportional to the uncertainty scores
    # (scaled so their mean matches the budget), mixed with a uniform
    # component of weight tau and clipped to [0, 1].
    eta = budget / np.mean(uncertainty)
    probs = np.clip((1-tau)*eta*uncertainty + tau*budget, 0, 1)
    # Uniform sampling labels every observation with probability `budget`.
    unif_probs = np.full(n, budget)

    # The original inner loops reused `i` and overwrote the trial index;
    # with the loops vectorized away the trial counter stays intact.
    for trial in range(num_trials):
        # Active estimator: inverse-probability-weighted correction of Yhat.
        xi = bernoulli.rvs(probs)
        active_labels = (Y - Yhat)*xi/probs + Yhat
        temp_df.loc[0] = (
            *_sandwich_interval(X, active_labels, Hessian_inv, n, z_crit, theta_true),
            "active",
            n_b,
        )

        # Uniform estimator: same correction, labels sampled uniformly.
        xi_unif = bernoulli.rvs(unif_probs)
        unif_labels = (Y - Yhat)*xi_unif/budget + Yhat
        temp_df.loc[1] = (
            *_sandwich_interval(X, unif_labels, Hessian_inv, n, z_crit, theta_true),
            "uniform",
            n_b,
        )

        # Classical estimator: no predictions, reuses the uniform sample.
        class_labels = Y*xi_unif/budget
        temp_df.loc[2] = (
            *_sandwich_interval(X, class_labels, Hessian_inv, n, z_crit, theta_true),
            "classical",
            n_b,
        )

        results.append(temp_df.copy())

df = pd.concat(results, ignore_index=True)

### Plot coverage and interval width

In [None]:
# Plot interval widths and coverage for all estimators; the output filename
# passed here is "widths_and_coverage_census_batch.pdf".
make_width_coverage_plot(
    df,
    "regression coefficient",
    "widths_and_coverage_census_batch.pdf",
    theta_true,
    num_trials=num_trials,
    n_example_ind=3,
    less_precision=True,
)

### Plot budget saved

In [None]:
# Plot the budget comparison from the experiment results; the output filename
# passed here is "budget_census_batch.pdf".
make_budget_plot(
    df,
    "Census data analysis",
    "budget_census_batch.pdf",
)