# Horseshoe Priors: Sparse Models with Failure of Convergence

In this notebook, we use Bayesian regression with horseshoe priors to model [corn data](https://core.ac.uk/download/pdf/397803.pdf). The model is implemented using the publicly available [Stan code](https://github.com/yao-yl/Evaluating-Variational-Inference/blob/master/Rcode/glmbernoullirhs.stan) (additional details can be found in our paper). 
For this model the quality of the variational approximation is sensitive to the random initialization and to the stochastic variation during optimization: some runs yield a reasonable posterior, whereas others converge to a very bad solution. We fix this with the help of a decision-maker (a neural network), showing that it is capable of correcting errors in the posterior approximation.

### Imports

In [2]:
import time
import pickle
import pystan
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [3]:
from aux import tonumpy, parse_script_args, dict2str, print2
import losses
import horseshoe_regression
import regression_data

In [4]:
# Choose the compute backend once: `env` is the module matching the backend,
# `device` the corresponding torch.device, and the default tensor type decides
# where newly created tensors live.
if not torch.cuda.is_available():
    env, device = torch, torch.device('cpu')
    torch.set_default_tensor_type('torch.FloatTensor')
    print("Using CPU")
else:
    env, device = torch.cuda, torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    print("Using GPU")
    
    
def totorch(array):
    """Convert an array-like (list, NumPy array, scalar) into a float32 tensor.

    The tensor is built with ``torch.tensor``, so it is a copy of the input and
    is placed according to the currently configured default tensor type.

    Parameters
    ----------
    array : array-like
        Values to convert.

    Returns
    -------
    torch.Tensor
        Tensor with dtype ``torch.float32``.
    """
    # The dtype is taken from `torch` directly: `env` may be the `torch.cuda`
    # module, which is not documented to expose dtype attributes.
    return torch.tensor(array, dtype=torch.float32)

Using CPU


### Configuration

In [5]:
# Overrides for the configuration values below; later cells read them with args.get(NAME, default).
# Arguments can be passed in the format of NAME1=FLOATVAL1,NAME2=[STRVAL2],NAME3=INTVAL3,...
args = parse_script_args()

parsing: <-f>


In [6]:
# General optimization parameters (each can be overridden through `args`).
SEED = args.get("SEED", 5)  # used to seed NumPy, PyTorch and the Stan VI run
VI_NITER = int(args.get("VI_NITER", 1e6))  # iteration budget passed to sm.vb
HMC_NITER = int(args.get("HMC_NITER", 1e4))  # iterations for sm.sampling; values <= 0 skip HMC

In [7]:
# selected loss: tilted/squared/exptilted/expsquared
LOSS = args.get("LOSS", "tilted")
# Relevant only for tilted and exptilted; also the starting point of the quantile search.
TILTED_Q = args.get("TILTED_Q", 0.5)

In [8]:
# Regularization of the learned decisions.
LAMBDA = args.get("LAMBDA", 1)  # overall strength of the penalty
# Method name, lower-cased; the Regularization cell dispatches on the prefixes boot / boot1 / const / std / qdiff.
REGULARIZATION = args.get("REGULARIZATION", "boot1").lower() # const/boot1

In [9]:
# Index of the column of Y that is used as the regression target.
OUTPUT_NO = args.get("OUTPUT_NO", 3)

In [10]:
# Log the active configuration. dict2str receives the whole global namespace;
# judging by the recorded output it reports the upper-case settings only - TODO confirm in aux.
print("CONFIGURATION SUMMARY: %s" % dict2str(globals()))

CONFIGURATION SUMMARY: HMC_NITER=0 LAMBDA=1 LOSS=tilted OUTPUT_NO=3 REGULARIZATION=boot1 SEED=5 TILTED_Q=0.5 VI_NITER=1000000


In [11]:
# Seed both random number generators used by the notebook so runs are repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f28fbffe550>

### Losses

In [12]:
# Build the selected loss together with its analytical (Bayes-estimator) optimal decision.
# NOTE: the factory is configured from the full global namespace, so every global
# defined before this cell is passed to it as a keyword argument.
loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS)
print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % 
        (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__))

[LossFactory] Configuration: TILTED_Q=0.5 LINEX_C=None
> <tilted> loss: tilted_loss_fixedq with (analytical/Bayes estimator) h: tilted_optimal_h_fixedq


In [13]:
def empirical_risk(preds, y):
    """Average the selected loss of decisions `preds` against observed targets `y`.

    `y` is converted with `totorch` before being passed to `loss`; the value
    returned is the `.mean()` of the loss output (callers apply `.item()`).
    """
    targets = totorch(y)
    return loss(preds, targets).mean()

### Data

In [14]:
# Load the corn data and keep a single output column (OUTPUT_NO) as the target.
X, Y = regression_data.load_corn()
Y = Y[:, OUTPUT_NO]

In [15]:
# Alternate rows between the splits: odd-indexed rows train, even-indexed rows test.
TRAIN_MASK = (np.arange(X.shape[0]) % 2) != 0
X_TRAIN, X_TEST = X[TRAIN_MASK, :], X[~TRAIN_MASK, :]
Y_TRAIN, Y_TEST = Y[TRAIN_MASK], Y[~TRAIN_MASK]

X_TRAIN.shape, X_TEST.shape

((40, 700), (40, 700))

### Model preparation

In [16]:
# Compiling the Stan model is slow, so the compiled model is cached on disk
# and recompiled only when the cache cannot be loaded.
pickle_path = "horseshoe_regression_model.pkl"
try:
    print("Loading model from %s" % pickle_path)
    # NOTE: unpickling executes arbitrary code; only load cache files you created yourself.
    with open(pickle_path, "rb") as cache_file:
        sm = pickle.load(cache_file)
except Exception:
    # Missing, truncated or incompatible cache: rebuild it. Catching `Exception`
    # (rather than a bare `except`) lets Ctrl-C / kernel interrupts propagate
    # instead of silently triggering a recompilation.
    print("Failed. Recompiling model")
    start = time.time()
    sm = pystan.StanModel(model_code=horseshoe_regression.CODE)
    print("Compilation time:", time.time()-start)
    # The context manager guarantees the cache file is flushed and closed.
    with open(pickle_path, "wb") as cache_file:
        pickle.dump(sm, cache_file)

Loading model from horseshoe_regression_model.pkl
Failed. Recompiling model


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ac3e4982d09648ff2d5c2db32402b71b NOW.


Compilation time: 68.19340753555298


In [17]:
# Package the training split in the form passed as `data=` to the Stan model.
stan_data = horseshoe_regression.create_data(X_TRAIN, Y_TRAIN)

### VI Inference

In [18]:
# The VI fit is cached per (SEED, VI_NITER) because a single run can take many minutes.
path = "horseshoe_regression_vi_fit_%s_%s.pkl" % (SEED,VI_NITER)

try:
    print("Loading VI fit from %s" % path)
    # NOTE: unpickling executes arbitrary code; only load cache files you created yourself.
    with open(path, "rb") as cache_file:
        vi_fit = pickle.load(cache_file)
except Exception:
    # Missing or unreadable cache: run inference again. Catching `Exception`
    # (rather than a bare `except`) lets Ctrl-C / kernel interrupts propagate
    # instead of silently starting a long VI run.
    print("Failed. Running new inference.")
    start = time.time()
    # output_samples is a sample count, so it is passed as an integer.
    vi_fit = sm.vb(data=stan_data, iter=VI_NITER, output_samples=1000,
                   tol_rel_obj=0.001, eta=0.1, seed=SEED, verbose=True)
    print("VI time:", time.time()-start)
    # The context manager guarantees the cache file is flushed and closed.
    with open(path, "wb") as cache_file:
        pickle.dump(vi_fit, cache_file)

Loading VI fit from horseshoe_regression_vi_fit_5_1000000.pkl
Failed. Running new inference.




VI time: 887.9219462871552


In [19]:
# PSIS-LOO evaluation (currently disabled).
# `failed_ks` is stored with the results; while the diagnostic below stays
# commented out it is only a placeholder value, not a measured fraction.
failed_ks = 0.0
# import psis
# paramname2ix = dict((paramname, ix) for ix, paramname in enumerate(vi_fit["sampler_param_names"]))
# s = lambda paramname: vi_fit["sampler_params"][paramname2ix[paramname]] # access samples          
# loglik = np.array([s("loglik[%i]" % i) for i in range(1, X_TRAIN.shape[0]+1)]).T

# loo, loos, ks = psis.psisloo(loglik)
# failed_ks = sum(ks > 0.7) / len(ks)
# print("Fraction of failed k: %.3f" % failed_ks)

In [20]:
# Predictive samples from the VI fit for both splits, converted to tensors.
# presumably one row per posterior draw and one column per data point - TODO confirm in horseshoe_regression
ys_train = totorch(horseshoe_regression.sample_predictive_y0_vi(vi_fit, X_TRAIN))
ys_test = totorch(horseshoe_regression.sample_predictive_y0_vi(vi_fit, X_TEST))

In [21]:
# Decisions obtained by applying the analytical (Bayes-estimator) rule to the VI predictive samples.
hstar_train = optimal_h_bayes_estimator(ys_train)
hstar_test = optimal_h_bayes_estimator(ys_test)

In [22]:
# Empirical risk of the VI-based decisions on both splits (baseline for the later sections).
vi_train_risk = empirical_risk(hstar_train, Y_TRAIN).item()
vi_test_risk = empirical_risk(hstar_test, Y_TEST).item()
print("VI optimal risk: train:%.3f test:%.4f" % (vi_train_risk, vi_test_risk))

VI optimal risk: train:0.327 test:0.3408


### HMC Inference

In [23]:
# Optional HMC reference run; it is evaluated exactly like the VI fit above.
if HMC_NITER > 0:
    hmc_fit = sm.sampling(data=stan_data, iter=HMC_NITER, n_jobs=1)

    # Predictive samples -> Bayes-optimal decisions -> empirical risks.
    ys_train_hmc = totorch(horseshoe_regression.sample_predictive_y0_hmc(hmc_fit, X_TRAIN))
    ys_test_hmc = totorch(horseshoe_regression.sample_predictive_y0_hmc(hmc_fit, X_TEST))

    hstar_train_hmc = optimal_h_bayes_estimator(ys_train_hmc)
    hstar_test_hmc = optimal_h_bayes_estimator(ys_test_hmc)

    hmc_train_risk = empirical_risk(hstar_train_hmc, Y_TRAIN).item()
    hmc_test_risk = empirical_risk(hstar_test_hmc, Y_TEST).item()
else:
    print("Skipping HMC evaluation")
    # NOTE: 0.0 here means "not computed"; it is not a measured risk.
    hmc_train_risk = hmc_test_risk = 0.0

print("HMC optimal risk: train:%.4f test:%.4f" % (hmc_train_risk, hmc_test_risk))

Skipping HMC evaluation
HMC optimal risk: train:0.0000 test:0.0000


### Regularization

In [None]:
import regression_regularization as regularization

# The regularizer penalizes LAMBDA_TRAIN * (h - HSTAR_TRAIN)**2, so this cell
# must define both the penalty weights (LAMBDA_TRAIN) and the target (HSTAR_TRAIN).
HSTAR_TRAIN = hstar_train # default regularization mean

print("[regularization] Using regularization: %s with lambda=%s" % (REGULARIZATION, LAMBDA))
if REGULARIZATION.startswith("boot"):
    print("[regularization] Bootstrap-based regularization")
    # Refit VI on bootstrap resamples and collect the resulting training decisions.
    stan_data_generator = horseshoe_regression.yield_data_bootstrap(X_TRAIN, Y_TRAIN)
    sample_predictive_y0_train = lambda vi_fit: horseshoe_regression.sample_predictive_y0_vi(vi_fit, X_TRAIN)
    sample_predictive_y0_test = lambda vi_fit: horseshoe_regression.sample_predictive_y0_vi(vi_fit, X_TEST)
    optimal_h_bayes_estimator_np = lambda ys: tonumpy(optimal_h_bayes_estimator(totorch(ys)))

    hstar1_train, _ = regularization.get_bootstrap_decisions(sm, stan_data_generator,
                                        sample_predictive_y0_train, sample_predictive_y0_test,
                                        optimal_h_bayes_estimator_np, num_repetitions=5, iter=VI_NITER)
    # Weights are inversely proportional to the variance of the decisions across repetitions.
    # NOTE(review): a zero spread across repetitions makes the weight infinite.
    LAMBDA_TRAIN = LAMBDA * 1.0/(2. * hstar1_train.std(0)**2)

    if REGULARIZATION.startswith("boot1"):
        print("[regularization] Bootstrap-based regularization: overwritting regularization mean")
        HSTAR_TRAIN = totorch( hstar1_train.mean(0) )

elif REGULARIZATION.startswith("const"):
    LAMBDA_TRAIN = regularization.get_regularization_constant(ys_train, LAMBDA)
elif REGULARIZATION.startswith("std"):
    LAMBDA_TRAIN = regularization.get_regularization_std(ys_train, LAMBDA)
elif REGULARIZATION.startswith("qdiff"):
    LAMBDA_TRAIN = regularization.get_regularization_qdiff(ys_train, LAMBDA)
else:
    # Report the configured name; the previous message referenced an undefined
    # variable and therefore failed with NameError instead of this error.
    raise ValueError("[regularization] Unknown regularization method name: %s" % REGULARIZATION)

LAMBDA_TRAIN = totorch(LAMBDA_TRAIN)

[regularization] Using regularization: boot1 with lambda=1
[regularization] Bootstrap-based regularization
[get_bootstrap_decisions] 0/5




### Quantile optimization

In [None]:
def training_empirical_risk(q):
    """Regularized training objective as a function of the quantile level.

    Intended as the objective for ``scipy.optimize.minimize``.

    Parameters
    ----------
    q : sequence of float
        Parameter vector from the optimizer; only ``q[0]`` is used and it is
        clipped to the interval [0, 1].

    Returns
    -------
    float
        Training empirical risk plus the regularization penalty, as a plain
        Python float so the optimizer never has to convert a torch tensor.
    """
    q = max(0., min(1., q[0]))
    h = losses.tilted_optimal_h(ys_train, q) # obtaining a quantile in a very convoluted way :)
    risk = empirical_risk(h, Y_TRAIN).item()
    # .item() turns the 0-d tensor into a float (also works for tensors living on the GPU).
    regularizer = ( LAMBDA_TRAIN * (h-HSTAR_TRAIN)**2 ).mean().item()
    obj = risk + regularizer
    print("evaluating training risk @ q=%.2f => risk=%.3f => obj=%.3f" % (q, risk, obj))
    return obj

In [None]:
from scipy.optimize import minimize
# Derivative-free (Nelder-Mead) search over the quantile level, started at TILTED_Q.
# NOTE(review): newer SciPy releases name this tolerance `xatol`; confirm that
# `xtol` is still honoured by the installed version.
res = minimize(
    training_empirical_risk,
    [TILTED_Q],
    method='Nelder-Mead',
    options=dict(xtol=1e-5, disp=True, maxiter=100),
)

In [None]:
# Clip the optimizer's solution to [0, 1] (same clipping as inside the objective)
# and report the unregularized risk of the resulting quantile decisions.
q = max(0., min(1., res["x"][0]))
q_train_risk = empirical_risk(losses.tilted_optimal_h(ys_train, q), Y_TRAIN).item() 
q_test_risk = empirical_risk(losses.tilted_optimal_h(ys_test, q), Y_TEST).item() 
print("Quantile optimal risk: train:%.3f test:%.4f" % (q_train_risk, q_test_risk))

### Decision maker

In [None]:
# Prepare features 
Quantiles = np.array([np.percentile(ys_train, int(q*100), axis=0) for q in np.arange(0., 1.01, 0.05)])
X_train = Quantiles

Quantiles = np.array([np.percentile(ys_test, int(q*100), axis=0) for q in np.arange(0., 1.01, 0.05)])
X_test = Quantiles

X_train, X_test = totorch(X_train.T), totorch(X_test.T)
X_test.shape, X_train.shape

In [None]:
# Train the decision maker: a small fully connected network mapping the quantile
# features of a data point to a decision. Training is restarted several times
# from fresh random initializations and the network with the lowest training
# risk is kept.
torch.manual_seed(SEED)
NUM_RESTARTS = 10
best_decision_maker, best_risk = None, float("inf") 
for _ in range(NUM_RESTARTS):
    print("RESTARTING")
    
    # Fresh network (new random weights) for every restart.
    decision_maker = nn.Sequential(
      nn.Linear(X_train.shape[1], 20),
      nn.ReLU(),
      nn.Linear(20, 10),
      nn.ReLU(),
      nn.Linear(10, 1)
    )

    optimizer = torch.optim.Adam(decision_maker.parameters(), lr=0.01)
    start = time.time()
    for i in range(20000):
        # Full-batch step: one decision per training point.
        h = decision_maker(X_train).view(-1)
        train_loss = empirical_risk(h, Y_TRAIN)  
        # Penalize decisions that move away from HSTAR_TRAIN, weighted by LAMBDA_TRAIN.
        regularizer = ( LAMBDA_TRAIN * (h-HSTAR_TRAIN)**2 ).mean()

        optimizer.zero_grad()
        (train_loss+regularizer).backward()
        optimizer.step()

        # Progress report every 1000 iterations; the last two numbers are
        # recomputed with the weights obtained after this step.
        if i%1000==0:
            print("[%.2fs] %i. iteration: training batch risk: %.3f regularizer: %.3f train risk: %.3f test risk: %.3f" % 
              (time.time()-start, i, train_loss.item(), regularizer.item(),
               empirical_risk(decision_maker(X_train).view(-1), Y_TRAIN).item(),
               empirical_risk(decision_maker(X_test).view(-1), Y_TEST).item()))
            
    # Model selection uses the unregularized training risk only (no test data).
    dm_train_risk = empirical_risk(decision_maker(X_train).view(-1), Y_TRAIN).item()           
    if dm_train_risk<best_risk: best_decision_maker, best_risk = decision_maker, dm_train_risk
# NOTE(review): stays None if no restart produced a risk below infinity (e.g. NaN risks).
decision_maker = best_decision_maker                    

In [None]:
# Empirical risk of the selected decision maker on both splits.
dm_train_risk = empirical_risk(decision_maker(X_train).view(-1), Y_TRAIN).item()
dm_test_risk = empirical_risk(decision_maker(X_test).view(-1), Y_TEST).item()
print("DM optimal risk: train:%.3f test:%.4f" % (dm_train_risk, dm_test_risk))

### Store evaluation results

In [None]:
# One result row: the configuration first, then train/test risks of the
# VI decisions, the optimized quantile, the decision maker and HMC,
# and finally the PSIS diagnostic value.
results = [[SEED, LOSS, TILTED_Q, REGULARIZATION, LAMBDA, VI_NITER, HMC_NITER, OUTPUT_NO,
            vi_train_risk, vi_test_risk, 
            q_train_risk, q_test_risk, 
            dm_train_risk, dm_test_risk,
            hmc_train_risk, hmc_test_risk, 
            failed_ks]]

In [None]:
# Write the result row to a CSV file named after the configuration (no header, no index column).
# NOTE(review): the prefix is spelled "horseshore"; it is kept as is because
# other scripts may look for files under this exact name.
path = "horseshore_regression_%i_%s_%s_%s_%s_%s_%s.csv" % \
            (SEED, LOSS, TILTED_Q, REGULARIZATION, LAMBDA, VI_NITER, OUTPUT_NO)
print("Saving to %s" % path)
pd.DataFrame(results).to_csv(path, header=False, index=False)