In [None]:
# standard packages
import numpy as np
import scipy.stats as sps

from sklearn.mixture import BayesianGaussianMixture

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2
# custom packages
import supplemental_funcs as sf
import example_master as EM

# Expensive Computations

The purpose of this notebook is to do expensive computations and then save numpy arrays that can be easily loaded by other notebooks without having to do the computations a bunch of times.

In [None]:
# Registry of every array this notebook produces, keyed by the name it is
# saved under; each value should be a numpy array.
big_numpy_dict = dict()

# Pending saves: one (filename, [keys]) tuple per output file, where the
# keys index into big_numpy_dict.
save_all_list = list()

In [None]:
# helper that writes a chosen subset of arrays to one .npz file
def save_np_dict(filename,this_dict,elements):
    """Save selected entries of ``this_dict`` with ``np.savez``.

    Parameters
    ----------
    filename : passed straight to ``np.savez`` as the output target
    this_dict : mapping from name to array
    elements : iterable of keys of ``this_dict`` to save; each key becomes
        the array's name inside the archive

    Raises
    ------
    KeyError
        If any key in ``elements`` is missing from ``this_dict``.
    """
    selected = {}
    for name in elements:
        selected[name] = this_dict[name]
    np.savez(filename,**selected)

## KDE Expensive Computations

### Bootstrap KDE Errors

Note that I am a little lazy here. Instead of generating a bunch of new samples to compute the MISE, I compute the MISE on bootstraps of the original sample for different sample sizes.

This leads to convergence plots which are slightly biased with lower variance. In other words, the convergence is likely slightly faster and less variable than is probably realistic.

These plots could be made more accurately with new samples, but since I want to re-use bootstrap samples for constructing the CI, doing so would require more computation time.

In [None]:
# load parameters from the example master
B_n = 100  # number of bootstrap replicates
sample = EM.tri_peak_sample
total_n = EM.tri_peak_n
N_list = EM.tri_peak_N_list
obs_exact_dist = EM.tri_peak_mixture

# Draw the bootstrap replicates: each of the B_n rows resamples `sample`
# with replacement (the default of `choice`) to length total_n.
# A dedicated, seeded generator makes the arrays saved by this notebook
# reproducible on Restart & Run All without touching numpy's global
# random state.
bootstrap_seed = 0
bootstrap_rng = np.random.RandomState(bootstrap_seed)
Bootstrapped_Samples = bootstrap_rng.choice(sample,size=(B_n,total_n))

In [None]:
# eyeball a single bootstrap replicate (row 80) to verify that the
# resampled values look reasonable
plt.hist(Bootstrapped_Samples[80],density=True,edgecolor='k')

Compute the convergence with three different bandwidth parameters $h$ for the GKDE:

* **Scott's rule:** $h=\hat{\sigma}n^{-1/5}$

* **HMISE rule:** $h_{MISE}\approx \left(2.09122^{-1/5}\right)n^{-1/5}$ (multiplicative factor computed in [this notebook](Review%20of%20Density%20Estimation.ipynb))

* **1/2 IQR:** $h_{IQRD}\approx \left(\frac{IQR}{2}\right)n^{-1/5}$ (justification and reasoning provided in  [this notebook](DCI%20and%20Density%20Estimation.ipynb))

In [None]:
# Build the bootstrap GKDEs.
# obs_kde_list[rule][N] is a list with one gaussian_kde per bootstrapped
# sample, each fitted to the first N points of that sample.

# approximate value loaded from example master
R_factor = EM.KDE_MISE_factor

bandwidth_rules = ('Scott', 'HMISE', 'hIQRD')
obs_kde_list = {rule: {} for rule in bandwidth_rules}
for N in N_list:
    for rule in bandwidth_rules:
        obs_kde_list[rule][N] = []

    # the HMISE bandwidth depends only on N, so compute it once per N
    this_hmise = (R_factor/N)**(1/5)
    for B_sample in Bootstrapped_Samples:
        sub_sample = B_sample[0:N]
        # gaussian_kde treats a scalar bw_method as a factor on the sample
        # standard deviation, so each target bandwidth h is divided by it
        sub_std = np.std(sub_sample,ddof=1)

        # Scott's rule: the gaussian_kde default
        obs_kde_list['Scott'][N].append(sps.gaussian_kde(sub_sample))

        # HMISE rule
        obs_kde_list['HMISE'][N].append(
            sps.gaussian_kde(sub_sample,bw_method=this_hmise/sub_std))

        # half-IQR rule.  IQR_dev stays defined at notebook scope after
        # this loop; the name is kept because a later cell reads it.
        IQR_dev = sps.iqr(sub_sample)/2
        this_hIQRD = IQR_dev*len(sub_sample)**(-1/5)
        obs_kde_list['hIQRD'][N].append(
            sps.gaussian_kde(sub_sample,bw_method=this_hIQRD/sub_std))

        
        

In [None]:
# sanity check: the array was drawn with size [B_n, total_n]
Bootstrapped_Samples.shape

In [None]:
# L2 error of every bootstrap KDE against the exact observed distribution
# (the per-sample errors from which the MISE is estimated).
# obs_err_list[rule][N] is a list with one error per bootstrapped sample.
# limit_B = 5 # takes just a subset of samples
limit_B = Bootstrapped_Samples.shape[0] # takes all bootstrapped samples

obs_err_list = {'Scott': {}, 'HMISE': {}, 'hIQRD': {}}
for N in N_list:
    # identical evaluation for each bandwidth rule, in a fixed order
    for rule in ('Scott','HMISE','hIQRD'):
        rule_errs = []
        for kde in obs_kde_list[rule][N][0:limit_B]:
            # the second returned value (tol) is not used
            this_err, tol = sf.L2_err_1D(kde,obs_exact_dist,-10,25,
                                         quad_kwargs={'epsabs':1e-6})
            rule_errs.append(this_err)
        obs_err_list[rule][N] = rule_errs

In [None]:
# Check that results are reasonable with a log-log scatter of error vs N.
# Each matrix has one row per bootstrapped sample and one column per N.
err_matrix = np.array(list(obs_err_list['Scott'].values())).T
err_matrix2 = np.array(list(obs_err_list['HMISE'].values())).T
err_matrix_IQRD = np.array(list(obs_err_list['hIQRD'].values())).T

# assumes N_list is a Python list, so `*` repeats it once per row and the
# result lines up with the row-major flattening below -- TODO confirm
log_N = np.log(N_list*limit_B)
plt.scatter(log_N,np.log(err_matrix.reshape(-1,)))
plt.scatter(log_N,np.log(err_matrix2.reshape(-1,)))
plt.scatter(log_N,np.log(err_matrix_IQRD.reshape(-1,)))

In [None]:
# sanity check: rows index bootstrapped samples, columns index N_list
err_matrix.shape

In [None]:
# register the KDE error matrices under the keys they will be saved with
big_numpy_dict.update({'ScottMISE': err_matrix,
                       'OptimalMISE': err_matrix2,
                       'hIQRD_MISE': err_matrix_IQRD})

# queue them for the combined save
this_filename = EM.tri_peak_MISE_name
save_all_list.append((this_filename,['ScottMISE','OptimalMISE','hIQRD_MISE']))

In [None]:
# # save just the MISE errors
# this_filename = EM.tri_peak_MISE_name
# save_np_dict(this_filename,big_numpy_dict,['ScottMISE',
#                                            'OptimalMISE',
#                                            'hIQRD_MISE'])

### Bootstrap Confidence Intervals

Here we use the bootstrapped samples to compute confidence intervals for the GKDE of the tripeak density.

In [None]:
# points passed to sf.eval_pdf for every KDE (the pointwise confidence
# intervals are taken at each of these points)
this_qx = EM.tri_peak_qx

# sample size whose bootstrap KDEs are evaluated
CI_sample_N = EM.tri_peak_CI_sample_size

# this_qy_vals[rule] collects one evaluation per bootstrap KDE
this_qy_vals = {'Scott':[],'HMISE':[],'hIQRD': []}
for key, kdes_by_N in obs_kde_list.items():
    for kde in kdes_by_N[CI_sample_N]:
        this_qy_vals[key].append(sf.eval_pdf(this_qx,kde))

In [None]:
# store each rule's evaluations as an array under the key '<rule>CI'
for rule, evals in this_qy_vals.items():
    big_numpy_dict[rule+'CI'] = np.array(evals)

In [None]:
# # save just the CI Values: NOTE: We also want to save the CI for 
# # the update from the bootstrap as well. This is defined later!
# this_filename = EM.tri_peak_CI_name
# save_np_dict(this_filename,big_numpy_dict,['ScottCI','HMISECI','hIQRDCI'])

## Expensive BGM and DPMM Computations

Here we compute the $L^2$ error for a large sample of BGMM models and a DPMM model.

In [None]:
# get the key parameters from the example master
sample = EM.tri_peak_sample
total_n = EM.tri_peak_n
N_list = EM.tri_peak_N_list
obs_exact_dist = EM.tri_peak_mixture

# number of independent samples drawn for each sample size
M = 100

# These are fresh draws from the exact observed distribution (not
# bootstrap resamples): one batch per N, requested with size=[M,N]
M_Samples = []
for N in N_list:
    M_Samples.append(obs_exact_dist.rvs(size=[M,N]))

In [None]:
# keyword arguments shared by every BayesianGaussianMixture built in this
# section, taken from the example master module (imported as EM);
# displayed for inspection
BGM_general_arg = EM.tri_peak_BGMM_arg_dict
BGM_general_arg

In [None]:
# prior for distribution A: finite Dirichlet mixture with this_K components,
# centred on the rounded sample mean and rounded sample variance
this_K = 5
arg_prior_dict_A = dict(
    n_components=this_K,
    weight_concentration_prior_type='dirichlet_distribution',
    weight_concentration_prior=1,
    mean_prior=np.atleast_1d(np.round(np.mean(sample))),
    mean_precision_prior=1, # kappa
    degrees_of_freedom_prior=1, # nu
    covariance_prior=np.atleast_2d(np.round(np.cov(sample))), # psi
)
arg_prior_dict_A

In [None]:
# prior for distribution C
# window: standard deviation of the narrowest of the three EM.dist*
# distributions, used to get a better fit
var_min = np.min([dist.var() for dist in (EM.distA,EM.distB,EM.distC)])
window = np.sqrt(var_min)
print(window)

# precision parameter chosen so that window**2/kappa equals the sample
# variance, i.e. the variance of the means is the variance of the sample
this_kappa0 = window**2/np.cov(sample)
print(this_kappa0)

# prior C = prior A with the adjusted window and precision.
# NOTE(review): covariance_prior receives `window` (a standard deviation),
# while prior A and this_kappa0 both work with variances -- confirm whether
# window**2 was intended here.
arg_prior_dict_C = {**arg_prior_dict_A,
                    'covariance_prior': np.atleast_2d(window),
                    'mean_precision_prior': this_kappa0}
print(arg_prior_dict_C)
print()

# DPMM prior = prior C with a Dirichlet-process weight prior and a larger
# cap on the number of components
arg_prior_dict_DP_C = {**arg_prior_dict_C,
                       'n_components': 30,
                       'weight_concentration_prior_type': 'dirichlet_process'}
print(arg_prior_dict_DP_C)

In [None]:
# Create one unfitted Bayesian mixture model per prior and per sample size.
# obs_BGM_list[name][N] is a dict whose 'model' entry holds the estimator.
#
# 'DPMM_C' must be built from arg_prior_dict_DP_C (Dirichlet-process weight
# prior, 30 components); building it from arg_prior_dict_C would make it an
# exact duplicate of 'BGMM_C'.
prior_by_model = {'BGMM_A': arg_prior_dict_A,
                  'BGMM_C': arg_prior_dict_C,
                  'DPMM_C': arg_prior_dict_DP_C}

obs_BGM_list = {name: {} for name in prior_by_model}
for N in N_list:
    for name, prior_args in prior_by_model.items():
        obs_BGM_list[name][N] = {'model': BayesianGaussianMixture(**prior_args,
                                                                  **BGM_general_arg)}
    

In [None]:
# number of samples fitted for each (model, N) pair
# limit_BGM = 2 # takes just a subset of samples
# similar to the bootstrap estimate, but never more than the M samples that
# were actually drawn for each N
limit_BGM = min(100, M)

# fit all of the BGM models and record the fitted parameters
for key in obs_BGM_list:
    for nj,N in enumerate(N_list):
        # this model and samples
        this_model = obs_BGM_list[key][N]['model']
        these_samples = M_Samples[nj]

        # dictionary to save parameters: one entry per fitted sample
        param_sample = {'weight': [], 'mean': [], 'cov': []}
        obs_BGM_list[key][N]['param_sample'] = param_sample

        # The loop variable is deliberately not called `sample`: that name
        # holds the observed sample (EM.tri_peak_sample) at notebook scope
        # and must not be overwritten by this loop.
        for this_sample in these_samples[0:limit_BGM]:
            # fit the model to this specific dataset (one feature column)
            this_model.fit(this_sample.reshape(-1,1))
#             print('{}, {}: '.format(key,N),obs_BGM_list[key][N]['model'].converged_)

            # save the weights, means and covariances
            param_sample['weight'].append(np.squeeze(this_model.weights_))
            param_sample['mean'].append(np.squeeze(this_model.means_))
            param_sample['cov'].append(np.squeeze(this_model.covariances_))
        
        

In [None]:

# # for each of the BGMs generate limit_B random pdf samples
# for key in obs_BGM_list:
#     for N in N_list:
#         this_BGM = obs_BGM_list[key][N]['model']
#         this_sample_param = sf.Forward_BGM_Model(this_BGM).rvs(limit_BGM)
#         obs_BGM_list[key][N]['param_sample'] = this_sample_param


In [None]:
# Rebuild each fitted mixture as a pdf and compute its L2 error against the
# exact observed distribution.
# BGM_err_list[key]['L2_err'] has shape (limit_BGM, len(N_list)).
BGM_err_list = {}
for key in obs_BGM_list:
    # NaN-filled rather than np.empty: a slot without a corresponding fit
    # is then visibly missing instead of holding uninitialised memory that
    # would be saved as if it were a result
    this_err_array = np.full([limit_BGM,len(N_list)],np.nan)
    for nj,N in enumerate(N_list):
        # get the parameter weights, means and variances of every fit
        fitted = obs_BGM_list[key][N]['param_sample']
        this_weight = fitted['weight']
        this_mean = np.squeeze(fitted['mean'])
        this_var = fitted['cov']

        for ib, (mu,sig2,w) in enumerate(zip(this_mean,this_var,this_weight)):
            # one normal per component; sps.norm takes a standard
            # deviation, hence the square root of the variance
            pdfs = [sps.norm(m,np.sqrt(s2)) for m,s2 in zip(mu,sig2)]
            this_mixture_dist = sf.mixture_dist(pdfs,w)

            # get the errors and save them (second returned value unused)
            this_err_array[ib,nj], tol = sf.L2_err_1D(obs_exact_dist,
                                              this_mixture_dist,-10,25)
    # save error array to dictionary
    BGM_err_list[key] = {'L2_err': this_err_array}

In [None]:
# check errors make sense: log-log scatter of L2 error against N per model
err_matrix3 = BGM_err_list['BGMM_A']['L2_err']
err_matrix4 = BGM_err_list['BGMM_C']['L2_err']
err_matrix5 = BGM_err_list['DPMM_C']['L2_err']

# assumes N_list is a Python list, so `*` repeats it once per row of the
# error matrices -- TODO confirm
log_N_BGM = np.log(N_list*limit_BGM)
for label, this_matrix in (('BGMM_A',err_matrix3),
                           ('BGMM_C',err_matrix4),
                           ('DPMM_C',err_matrix5)):
    plt.scatter(log_N_BGM,np.log(this_matrix.reshape(-1,1)),label=label)
# plt.scatter(np.log(N_list*5),np.log(err_matrix.reshape(-1,)),label='ScottKDE')
# plt.scatter(np.log(N_list*5),np.log(err_matrix2.reshape(-1,)),label='HMISEkde')
plt.legend()

In [None]:
# register the BGM error arrays in the dictionary as '<model>_L2_err'
for model_name in ('BGMM_A','BGMM_C','DPMM_C'):
    big_numpy_dict[model_name+'_L2_err'] = BGM_err_list[model_name]['L2_err']

# queue them for the combined save
this_filename = EM.tri_peak_BGM_name
save_all_list.append((this_filename,['BGMM_A_L2_err','BGMM_C_L2_err','DPMM_C_L2_err']))

In [None]:
# # save just the CI Values
# this_filename = EM.tri_peak_BGM_name
# save_np_dict(this_filename,big_numpy_dict,['BGMM_A_L2_err',
#                                            'BGMM_C_L2_err',
#                                            'DPMM_C_L2_err'])

# Data-Consistent Update MSE

In this section, we do expensive computations for the update. We use some of the density estimations of the observed distribution from the previous section here to do the analysis of the update.

In [None]:
# load the Qmap
Q_map = EM.Q_nonlinear_1D_to_1D

# initial density on the parameter space
init_dist = sps.beta(a=1,b=1.5,scale=10)

# predicted density: GKDE of an initial sample pushed through the map.
# The draw is seeded so that predict_kde, and therefore exact_update and
# every error computed against it, is reproducible on Restart & Run All.
predict_seed = 0
predict_sample = Q_map(init_dist.rvs(5000,random_state=predict_seed))
predict_kde = sps.gaussian_kde(predict_sample)

# observed density
obs_dist = EM.tri_peak_mixture

# save the appropriate domains
lamx = EM.tri_peak_lamx
qx = EM.tri_peak_qx

# define exact update (uses the exact observed density, not an estimate)
exact_update = sf.dci_update(init_dist,predict_kde,obs_dist,Q_map)

In [None]:
# quick check distributions are correct: parameter space on the left,
# data space on the right
fig_exact_update, (axL,axD) = plt.subplots(1,2)
fig_exact_update.set_figwidth(9)

# parameter space
axL.plot(lamx,init_dist.pdf(lamx),ls='--',color='gray',
         alpha=0.7,label='Initial')
axL.plot(lamx,exact_update.pdf(lamx),label='Update')


# data space
axD.plot(qx,predict_kde.pdf(qx),ls='--',color='gray',label='Predicted')
axD.plot(qx,obs_dist.pdf(qx),label='Observed')

# typical labels; raw strings keep the LaTeX backslashes from being parsed
# as (invalid) Python escape sequences -- the label text is unchanged
axL.legend()
axL.set_title(r'Parameter Space $\Lambda$')
axL.set_xlabel(r'$\lambda$')
axD.legend()
axD.set_title(r'Data Space $\mathcal{D}$')
axD.set_xlabel('$q$')

Here we compute the $L^1$ and $L^2$ errors for the update using the bootstrapped samples of KDE.

In [None]:
# L1 and L2 errors of the update built from each hIQRD bootstrap KDE,
# measured against exact_update.
# limit_B = 5 # takes just a subset of samples
limit_B = Bootstrapped_Samples.shape[0] # takes all bootstrapped samples

update_err_list = {'L1': {}, 'L2': {}}
for N in N_list:
    errs_L1, errs_L2 = [], []
    update_err_list['L1'][N] = errs_L1
    update_err_list['L2'][N] = errs_L2

    for kde in obs_kde_list['hIQRD'][N][0:limit_B]:
        # updated distribution obtained with this KDE as the observed density
        this_update = sf.dci_update(init_dist,predict_kde,kde,Q_map)

        # L2 error of the update (second returned value unused)
        this_err_L2, tol = sf.L2_err_1D(this_update,exact_update,0.01,10,
                                        quad_kwargs={'epsabs':1e-6})
        errs_L2.append(this_err_L2)

        # L1 error of the update
        this_err_L1, tol = sf.L1_err_1D(this_update,exact_update,0.01,10,
                                        quad_kwargs={'epsabs':1e-6})
        errs_L1.append(this_err_L1)

In [None]:
# Check that results are reasonable: update L2 error next to the data-space
# hIQRD KDE error, both against N on log-log axes.
# Rows index bootstrapped samples, columns index N_list.
lam_err_matrix = np.array(list(update_err_list['L1'].values())).T
lam_err_matrix2 = np.array(list(update_err_list['L2'].values())).T

# assumes N_list is a Python list, so `*` repeats it once per row -- TODO confirm
log_N = np.log(N_list*limit_B)
# plt.scatter(np.log(N_list*limit_B),np.log(lam_err_matrix.reshape(-1,)))
plt.scatter(log_N,np.log(lam_err_matrix2.reshape(-1,)))
plt.scatter(log_N,np.log(err_matrix_IQRD.reshape(-1,)))


In [None]:
# register the update error matrices under the keys they are saved with
big_numpy_dict.update({'updateMISE_L1': lam_err_matrix,
                       'updateMISE_L2': lam_err_matrix2})

# queue them for the combined save
this_filename = EM.tri_peak_update_MISE_name
save_all_list.append((this_filename,['updateMISE_L1','updateMISE_L2']))

In [None]:
# # save just the MISE errors
# this_filename = EM.tri_peak_update_MISE_name
# save_np_dict(this_filename,big_numpy_dict,['updateMISE_L1',
#                                            'updateMISE_L2'])

Now compute MSE and L1 for DPMM.

In [None]:
# shared BayesianGaussianMixture arguments and the DPMM component cap
arg_dict = EM.tri_peak_BGMM_arg_dict
this_K = 15

# Initial covariance prior: (IQR/2)**2 of the full observed sample.
# It is computed here explicitly instead of reading an IQR_dev value left
# over from an earlier loop, so this cell does not depend on hidden state.
# The fitting loop replaces 'covariance_prior' for every sub-sample, so
# this value only serves as the starting entry.
init_IQR_dev = sps.iqr(EM.tri_peak_sample)/2

arg_prior_dict_DPMM_UP = {'n_components': this_K,
                'weight_concentration_prior_type': 'dirichlet_process',
                'weight_concentration_prior': 1,
                'mean_prior': np.atleast_1d(1),
                'mean_precision_prior': 1, # kappa
                'degrees_of_freedom_prior': 1, # nu
                'covariance_prior': np.atleast_2d(np.round(init_IQR_dev**2)) # psi
                   }

print(arg_prior_dict_DPMM_UP)

In [None]:
# DPMM errors for each bootstrapped sub-sample: L1 and L2 in data space
# ('L1q', 'L2q') and L2 of the resulting update ('L2lam').
print(limit_B) # number of bootstrapped samples used below
# limit_B = Bootstrapped_Samples.shape[0] # takes all bootstrapped samples

DPMM_update_err_list = {'L1q': {}, 'L2q': {}, 'L2lam': {}}
for N in N_list:
    errs_L1q, errs_L2q, errs_L2lam = [], [], []
    DPMM_update_err_list['L1q'][N] = errs_L1q
    DPMM_update_err_list['L2q'][N] = errs_L2q
    DPMM_update_err_list['L2lam'][N] = errs_L2lam

    for B_sample in Bootstrapped_Samples[0:limit_B]:
        # covariance prior is reset to round((IQR/2)**2) of this sub-sample;
        # note that the shared prior dict is modified in place
        this_sample = B_sample[0:N]
        IQR_dev = sps.iqr(this_sample)/2
        arg_prior_dict_DPMM_UP['covariance_prior'] = np.atleast_2d(np.round(IQR_dev**2))

        # define and fit the model (one feature column)
        this_DPMM = BayesianGaussianMixture(**arg_prior_dict_DPMM_UP,**arg_dict)
        this_DPMM.fit(this_sample.reshape(-1,1))

        # data-space errors against the exact observed density
        this_DPMM_Forward = sf.Forward_BGM_Model(this_DPMM)
        this_err_L1, tol = sf.L1_err_1D(this_DPMM_Forward,obs_dist,-10,25,
                                           quad_kwargs={'epsabs':1e-6})
        this_err_L2, tol = sf.L2_err_1D(this_DPMM_Forward,obs_dist,-10,25,
                                           quad_kwargs={'epsabs':1e-6})
        errs_L1q.append(this_err_L1)
        errs_L2q.append(this_err_L2)

        # L2 error of the update against exact_update.
        # NOTE(review): dci_update receives the raw fitted model this_DPMM,
        # whereas the data-space errors above use this_DPMM_Forward --
        # confirm that sf.dci_update accepts the unwrapped estimator.
        this_update = sf.dci_update(init_dist,predict_kde,this_DPMM,Q_map)
        this_up_err_L2, tol = sf.L2_err_1D(this_update,exact_update,0.01,10,
                                           quad_kwargs={'epsabs':1e-6})
        errs_L2lam.append(this_up_err_L2)
        
#         # compute the L1 error for the update
#         this_err_L1, tol = sf.L1_err_1D(this_update,exact_update,0.01,10,quad_kwargs={'epsabs':1e-6})
#         update_err_list['L1q'][N].append(this_err_L1)
        
#         # define the updated distribution for this KDE
#         this_update = sf.dci_update(init_dist,predict_kde,kde,Q_map)
        
        

In [None]:
# DPMM_update_err_list['L1q']

# stack the per-N error lists: rows index bootstrapped samples, columns N_list
DPMM_err_matrix1 = np.array(list(DPMM_update_err_list['L1q'].values())).T
DPMM_err_matrix2 = np.array(list(DPMM_update_err_list['L2q'].values())).T
DPMM_err_matrix3 = np.array(list(DPMM_update_err_list['L2lam'].values())).T

In [None]:
# log-log scatter of the three DPMM errors, with the hIQRD KDE error for
# comparison; assumes N_list is a Python list so `*` repeats it -- TODO confirm
log_N = np.log(N_list*limit_B)
for this_matrix in (DPMM_err_matrix1, DPMM_err_matrix2, DPMM_err_matrix3):
    plt.scatter(log_N,np.log(this_matrix.reshape(-1,)))

# plt.scatter(np.log(N_list*limit_B),np.log(err_matrix.reshape(-1,)))
plt.scatter(log_N,np.log(err_matrix_IQRD.reshape(-1,)))



In [None]:
# register the DPMM error matrices under the keys they are saved with
big_numpy_dict.update({'DPMM_err_L1': DPMM_err_matrix1,
                       'DPMM_err_L2': DPMM_err_matrix2,
                       'DPMM_up_err_L2': DPMM_err_matrix3})

# queue them for the combined save
this_filename = EM.tri_peak_DPMM_name
save_all_list.append((this_filename,['DPMM_err_L1','DPMM_err_L2','DPMM_up_err_L2']))

In [None]:
# # save just the MISE errors
# this_filename = EM.tri_peak_DPMM_name
# save_np_dict(this_filename,big_numpy_dict,['DPMM_err_L1',
#                                            'DPMM_err_L2',
#                                            'DPMM_up_err_L2'])

### Bootstrap Confidence Intervals for the Update

Use the bootstrapped KDEs to compute the bootstrapped CI for the update.

In [None]:
# evaluation points from the example master; the updates are evaluated
# at this_lamx
this_lamx = EM.tri_peak_lamx
this_qx = EM.tri_peak_qx

# sample size whose bootstrap KDEs are used
CI_sample_N = EM.tri_peak_CI_sample_size

# one update evaluation per hIQRD bootstrap KDE
this_lamy_vals = {'hIQRD': []}
for key, evals in this_lamy_vals.items():
    for kde in obs_kde_list[key][CI_sample_N]:
        this_update = sf.dci_update(init_dist,predict_kde,kde,Q_map)
        evals.append(sf.eval_pdf(this_lamx,this_update))

In [None]:
# store the update evaluations as an array under 'updateCI'
big_numpy_dict.update(updateCI=np.array(this_lamy_vals['hIQRD']))

In [None]:
# check that every CI key is present (a missing key raises KeyError) and
# show the shape of each array
for key in ('ScottCI','HMISECI','hIQRDCI','updateCI'):
    this_array = big_numpy_dict[key]
    print(this_array.shape)

In [None]:
# queue all four CI arrays for the combined save
ci_keys = ['ScottCI','HMISECI','hIQRDCI','updateCI']
this_filename = EM.tri_peak_CI_name
save_all_list.append((this_filename,ci_keys))

In [None]:
# # save ALL the CI Values
# this_filename = EM.tri_peak_CI_name
# save_np_dict(this_filename,big_numpy_dict,['ScottCI','HMISECI',
#                                            'hIQRDCI','updateCI'])

## Big Save of Numpy Arrays!

Better to use individual saves, but may be used to save all the numpy arrays at once.

In [None]:
# check the save all list before saving: each entry is a
# (filename, [keys]) tuple; re-running a cell that appends to the list
# adds a duplicate entry
save_all_list

In [None]:
# for filename,these_keys in save_all_list:
#     save_np_dict(filename,big_numpy_dict,these_keys)