In [None]:
import numpy as np
import pandas as pd
import scipy.stats as spstats
from scipy import signal
import pickle
from multiprocessing import Pool
import multiprocessing
import scipy.sparse as sparse
from sklearn.model_selection import train_test_split

from potentials import potentialRegression
from baselines import construct_ESVM_kernel,split_dataset,set_function,standartize
from optimize import Run_eval_test,optimize_parallel_new
from samplers import MCMC_sampler,Generate_train
from utils import *

## Setting parameters

<p>Datasets:</p>
<ol>
    <li>Eeg $(N=14\,980,\ d=15)$</li>
    <li>Susy $(N=500\,000,\ d=19)$</li>
</ol>

In [None]:
# Dataset to run on: either "eeg" or "susy"
dataset = "eeg"

# Whether the intercept is included
intercept = True

# Whether second-order control variates are included
CV2 = False

# Sampling method. Plain SGLD is used during the burn-in period so that
# SAGA does not get stuck at local minima.
method = dict(sampler="ULA", burn_type="SGLD", main_type="SAGA")

# Function to evaluate; one of "posterior_prob_point", "posterior_prob_mean",
# "posterior_prob_variance", "posterior_mean"
f_type = "posterior_prob_mean"

In [None]:
# Tuning parameters
step = 1 * 10**(-1)  # Step value handed to the train and test samplers
n_traj_train = 5  # Number of independent MCMC trajectories for train
n_traj_test = 100  # Number of independent MCMC trajectories for test

# Dataset-specific sizes. An unrecognised dataset name is rejected here,
# instead of leaving these variables undefined and failing later with a
# NameError in an unrelated cell.
if dataset == "eeg":
    batch_size = 1 * 15  # Batch size for stochastic gradient
    N_b = 5 * 10**3  # Burn-in period
    N_train = 1 * 10**4  # Length of the train trajectory
    N_test = 1 * 10**5  # Length of the test trajectories
elif dataset == "susy":
    batch_size = 3 * 19  # Batch size for stochastic gradient
    N_b = 5 * 10**4  # Burn-in period
    N_train = 1 * 10**5  # Number of samples on which we optimize
    N_test = 1 * 10**6  # Number of samples
else:
    raise ValueError(
        'Unknown dataset {!r}: expected "eeg" or "susy"'.format(dataset)
    )

## Loading data

In [None]:
# Read the raw CSV for the selected dataset and split it into the label
# column Y and the feature columns X. `outliers_inds` holds the row positions
# that the preprocessing step removes.
if dataset == "eeg":
    data = pd.read_csv("data/eeg.csv", header=None)
    outliers_inds = np.array([13179, 11509, 898, 10386])
    Y = data.iloc[:, -1]  # label is the last column
    X = data.iloc[:, :-1]
elif dataset == "susy":
    data = pd.read_csv("data/susy.csv", header=None)
    outliers_inds = np.array([267630])
    Y = data.iloc[:, 0]  # label is the first column
    X = data.iloc[:, 1:]
else:
    # Fail fast rather than leaving data/X/Y undefined for the cells below.
    raise ValueError(
        'Unknown dataset {!r}: expected "eeg" or "susy"'.format(dataset)
    )

## Preprocessing data

In [None]:
# Drop the outlier rows (if any) and expose features/labels as numpy arrays
X_processed = np.asarray(X)
Y_processed = np.asarray(Y)
if outliers_inds.size != 0:
    X_processed = np.delete(X_processed, outliers_inds, 0)
    # Boolean mask that is False exactly at the outlier positions
    keep_rows = np.ones(len(Y), dtype=bool)
    keep_rows[outliers_inds] = False
    Y_processed = Y_processed[keep_rows]

In [None]:
if f_type != "posterior_mean":
    # Hold out 100 points (stratified by label, fixed seed) as the test set,
    # then standardize train and test together.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_processed,
        Y_processed,
        test_size=100,
        random_state=1812,
        stratify=Y_processed,
    )
    X_train, X_test = standartize(X_train, X_test, intercept=intercept)
else:
    # No test split here: the whole processed dataset is used for training.
    # NOTE(review): both unpacking targets are X_train, so the second value
    # returned by standartize is the one that is kept - confirm this is intended.
    X_train, X_train = standartize(X_processed, X_processed, intercept=intercept)
    Y_train = Y_processed

## Creating potential

In [None]:
# Optimisation settings handed to potentialRegression via `optim_params`
optim_params = dict(
    compute_fp=False,
    GD=False,
    stochastic=False,
    order=1,
    n_restarts=5,
    batch_size=100,
    sigma=1.0,
    gtol=1e-6,
    gamma=5e-4,
    weight_decay=0.995,
    loop_length=100,
    n_loops=300,
)

In [None]:
# Build one ESVM kernel per trajectory length: train first, then test
W_train_spec, W_test_spec = (
    construct_ESVM_kernel(n_samples) for n_samples in (N_train, N_test)
)

In [None]:
# Build the potential from the training data
Cur_pot = potentialRegression(
    Y_train,
    X_train,
    typ="l",
    optim_params=optim_params,
    batch_size=batch_size,
    print_info=True,
)
# Dimension, as reported by the potential
d = Cur_pot.d

## Sampling training trajectories

In [None]:
# Generate the training trajectories and stack them into a single array
res = np.asarray(
    Generate_train(n_traj_train, method, Cur_pot, step, N_b, N_train, d)
)
# Index 0 / 1 along the second axis: trajectories / their gradients
traj = res[:, 0, :, :]
traj_grad = res[:, 1, :, :]

## Initialization of function values

If the function you want to evaluate is "posterior_prob_point" or "posterior_mean", pass the indices of the variables over which you want to optimize through the `inds_arr` parameter. For example, in the case of "posterior_prob_point",

>inds_arr = np.array([0])

means that you want to reduce the variance for the point with index $0$ in the test dataset.

In [None]:
if f_type != "posterior_mean":
    # Index 0 is used; the test data is passed through `params`
    inds_arr = np.array([0])
    params = {"X": X_test, "Y": Y_test}
else:
    # Taking the second index (not the intercept); no extra parameters
    inds_arr = np.array([1])
    params = None

# Evaluate the chosen function along the training trajectories
f_vals = set_function(f_type, traj, inds_arr, params)

## Training coefficients for EVM and ESVM

In [None]:
# Settings for the control-variate coefficient optimisation
n_restarts = 2  # how many restarts the optimisation performs
sigma = 1       # deviation of the starting points
tol = 1e-5      # tolerance on the norm of the gradient

In [None]:
# First-order control variates are always trained
A_ESVM_1, A_EVM_1, A_LS_1 = optimize_parallel_new(
    1, inds_arr, f_vals, traj, traj_grad, W_train_spec, n_restarts, tol, sigma
)

if not CV2:
    # Second-order control variates disabled: use all-zero placeholders
    # (three separate arrays of shape (2, d**2 + d))
    A_ESVM_2, A_EVM_2, A_LS_2 = (np.zeros((2, d**2 + d)) for _ in range(3))
else:
    A_ESVM_2, A_EVM_2, A_LS_2 = optimize_parallel_new(
        2, inds_arr, f_vals, traj, traj_grad, W_train_spec, n_restarts, tol, sigma
    )

## Coefficients for control variates

In [None]:
# Show the first-order coefficients of each method, header line then matrix
print("Coefficients for ESVM", A_ESVM_1, sep="\n")
print("Coefficients for EVM", A_EVM_1, sep="\n")
print("Coefficients for LS", A_LS_1, sep="\n")

In [None]:
# Second-order coefficients are only shown when they were actually trained
if CV2:
    print("Coefficients for ESVM", A_ESVM_2, sep="\n")
    print("Coefficients for EVM", A_EVM_2, sep="\n")
    print("Coefficients for LS", A_LS_2, sep="\n")

## Testing EVM and ESVM

In [None]:
# Create a dictionary, put respective matrices into it
CV_dict = {"ESVM":[A_ESVM_1,A_ESVM_2],"EVM":[A_EVM_1,A_EVM_2],"LS":[A_LS_1,A_LS_2]}
# Number of cores exploited for the computation of the independent trajectories
# by default, all available cores on the machine
nbcores = multiprocessing.cpu_count()
# One argument tuple per independent test trajectory
test_args = [
    (i,method,inds_arr,Cur_pot,W_test_spec,CV_dict,step,N_b,N_test,d,params,f_type)
    for i in range(n_traj_test)
]
# The context manager shuts the worker processes down even if a task raises;
# starmap blocks until every result is available, so nothing is lost on exit.
with Pool(nbcores) as trav:
    res = trav.starmap(Run_eval_test, test_args)
res_arr = np.asarray(res) # Saving results as np.array

## Results

In [None]:
# Average each estimator over the independent test trajectories (axis 0).
# Column layout on the third axis as used across this notebook:
# 0 vanilla, 1/2 ESVM (pol=1/2), 3/4 EVM (pol=1/2), 5/6 LS (pol=1/2).
print("Estimators")
print("SGLDFP {}".format(np.mean(res_arr[:,0,0,:],axis=0)))
print("ESVM pol=1 {}".format(np.mean(res_arr[:,0,1,:],axis=0)))
#print("ESVM pol=2 {}".format(np.mean(res_arr[:,0,2,:],axis=0)))
print("EVM pol=1 {}".format(np.mean(res_arr[:,0,3,:],axis=0)))
#print("EVM pol=2 {}".format(np.mean(res_arr[:,0,4,:],axis=0)))
# LS lives in column 5 (column 3 is EVM), matching the index used for LS
# when the variances are printed.
print("LS pol=1 {}".format(np.mean(res_arr[:,0,5,:],axis=0)))
#print("LS pol=2 {}".format(np.mean(res_arr[:,0,6,:],axis=0)))

In [None]:
# Variance of each estimator, averaged over the independent test trajectories
print("Variances")
print("Vanilla MC {}".format(res_arr[:, 1, 0, :].mean(axis=0)))
print("ZAV pol=1 {}".format(res_arr[:, 1, 1, :].mean(axis=0)))
#print("ZAV pol=2 {}".format(res_arr[:, 1, 2, :].mean(axis=0)))
print("ZV pol=1 {}".format(res_arr[:, 1, 3, :].mean(axis=0)))
#print("ZV pol=2 {}".format(res_arr[:, 1, 4, :].mean(axis=0)))
print("LS pol=1 {}".format(res_arr[:, 1, 5, :].mean(axis=0)))
#print("LS pol=2 {}".format(res_arr[:, 1, 6, :].mean(axis=0)))

## Plotting the results

In [None]:
# Settings shared by the plots below
var_ind = 0  # which entry along the last axis of res_arr is plotted
title = dataset.upper() + " dataset"
labels = [
    'Vanilla\n SAGA-LD',
    'SAGA-LD \nwith EVM',
    'SAGA-LD \nwith ESVM',
]

In [None]:
# Violin plots of the estimates: vanilla, EVM, ESVM (in the order of `labels`)
if not CV2:
    data = [res_arr[:, 0, col, var_ind] for col in (0, 3, 1)]
    violplot_ind(data, title, labels)
else:
    # First-order control variates next to second-order ones
    data1 = [res_arr[:, 0, col, var_ind] for col in (0, 3, 1)]
    data2 = [res_arr[:, 0, col, var_ind] for col in (0, 4, 2)]
    violplot_2ind(data1, data2, title, labels)

In [None]:
# Box plots of the estimates: vanilla, EVM, ESVM (in the order of `labels`)
if not CV2:
    data = [res_arr[:, 0, col, var_ind] for col in (0, 3, 1)]
    boxplot_ind(data, title, labels)
else:
    # First-order control variates next to second-order ones
    data1 = [res_arr[:, 0, col, var_ind] for col in (0, 3, 1)]
    data2 = [res_arr[:, 0, col, var_ind] for col in (0, 4, 2)]
    boxplot_2ind(data1, data2, title, labels)