# R68 MCMC_faster

Use MCMC to estimate yield model parameters for R68 data.
This is a redesigned version of the R68_MCMC notebook.
It is an effort to streamline the calculations and to keep better track of the settings used in each MCMC fit.

In [1]:
# Widen the notebook container so wide plots and outputs are not clipped.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#Set up notebook and load some R68 constants (V, eps, etc.)
# nb_setup.py is executed in this notebook's namespace. The file is opened in a
# with-block so the handle is closed as soon as its contents have been read.
with open("nb_setup.py") as _nb_setup_file:
    exec(_nb_setup_file.read())
from constants import *

Construct a dictionary to store all the MCMC fit parameters and results

In [3]:
# Settings (and, later, results) of this MCMC fit, collected in one place.
# 'Emax' and 'Ebins' start out as None and are filled in once the energy
# binning has been chosen.
mcmc_data = dict(
    g4_load_frac=0.1,
    cap_load_frac=0.1,
    cap_sim_file='/data/chocula/villaa/cascadeSimData/si28_R68_400k.pkl',
    cap_rcapture=0.161,
    Emax=None,
    Ebins=None,
    Ymodel='Lind',
    likelihood='Pois',
)

In [3]:
#Load the measured, Geant4 and (n,gamma) datasets, using the load fractions,
#file path and rcapture value recorded in mcmc_data
import R68_load as r68

meas = r68.load_measured()
g4 = r68.load_G4(load_frac=mcmc_data['g4_load_frac'])
cap = r68.load_simcap(
    file=mcmc_data['cap_sim_file'],
    rcapture=mcmc_data['cap_rcapture'],
    load_frac=mcmc_data['cap_load_frac'],
)

#Import yield models and the spectrum tools
import R68_yield as Yield
import R68_spec_tools as spec

#Create a 'Lind' yield object with parameter list [0.15] and print its `models` attribute
Y = Yield.Yield('Lind', [0.15])
print(Y.models)

Loading Measured Data...
(480634,)
(174510,)
Loading Geant4 Data...
(528848, 7)
(129555, 7)
Loading NRs...
1.1  min
Loading ERs...
0.3  min
Loading (n,gamma) Data...
400000


In [55]:
#Define likelihood functions
from scipy.special import factorial, gamma, loggamma

#Poisson likelihood of measuring k given expected mean of lambda
def pois_likelihood(k, lamb):
    """Return the Poisson probability lamb**k * exp(-lamb) / Gamma(k+1).

    k: observed counts (scalar or array)
    lamb: expected mean (scalar or array, broadcast against k)

    The value is computed as exp(k*log(lamb) - lamb - log(Gamma(k+1))) rather
    than by forming lamb**k and Gamma(k+1) directly. Those two factors overflow
    for moderately large counts (Gamma(k+1) is infinite in double precision
    for k > ~170) even though their ratio is an ordinary number.
    """
    # Imported here so the function does not depend on names imported elsewhere.
    from scipy.special import loggamma, xlogy

    # xlogy(k, lamb) is k*log(lamb), with the convention xlogy(0, 0) == 0, so
    # the k=0, lamb=0 case still evaluates to 1 as with the direct formula.
    return np.exp(xlogy(k, lamb) - lamb - loggamma(k+1.))

def ll_pois(k, lamb):
    """Poisson log-likelihood summed over all entries.

    k: observed counts
    lamb: expected (model) counts

    Returns sum(k*log(lamb) - lamb - log(Gamma(k+1))), or -inf as soon as any
    expected count is zero or negative.
    """
    has_bad_expectation = np.any(lamb<=0)
    if has_bad_expectation:
        return -np.inf

    per_bin = k*np.log(lamb) - lamb - loggamma(k+1.)
    return np.sum(per_bin)

def ll_norm(k,lamb):
    """Normal log-likelihood, the limit of the Poisson one for large lambda.

    k: observed counts
    lamb: expected (model) counts, used as both mean and variance

    Returns the sum over all entries, or -inf as soon as any expected count is
    zero or negative.
    """
    if np.any(lamb<=0):
        return -np.inf

    log_normalisation = -0.5*np.log(2*np.pi*lamb)
    half_chi2 = (k-lamb)**2/(2*lamb)
    return np.sum(log_normalisation - half_chi2)

def lp_flat(theta, bounds):
    """Log of a flat (box) prior.

    theta: array of parameter values
    bounds: array of parameter bounds. shape should be len(theta)x2

    Returns 0.0 when every parameter lies strictly between its lower and upper
    bound, and -inf otherwise.
    """
    limits = np.array(bounds)
    above_lower = (limits[:,0]<theta).all()
    inside = above_lower and (theta<limits[:,1]).all()
    return 0.0 if inside else -np.inf

def lp_norm(theta, mu, sigma):
    """Log of a normal prior distribution, summed over all parameters.

    theta: parameter value(s)
    mu: parameter prior distribution mean(s)
    sigma: parameter prior distribution sigma(s)
    """
    pull = (theta-mu)/sigma
    log_density = -0.5*pull**2 - np.log(sigma)-0.5*np.log(2*np.pi)
    return np.sum(log_density)

Construct the log-probability function, which contains most of the work of this calculation.
It takes our model parameters and returns the resulting log probability, log(likelihood*prior).

In [57]:
def calc_log_prob(theta=[0.2, 1, 1, 1], theta_bounds=((0,1),(0,10),(0,10),(0,10)), spec_bounds=(5,101),
                  likelihood='Pois', model=None):
    """Calculate the log probability, log(likelihood*prior), of one parameter set.

    theta: array of fit parameters (yield_par0, yield_par1, ...,  F_NR, scale_g4, scale_ng, ...)
    theta_bounds: parameter bounds of the flat prior, shape should be len(theta)x2
    spec_bounds: range of bin numbers in spectrum to consider. The analysis range is [bin_low,bin_high)
    likelihood: Likelihood function, either 'Pois' or 'Norm'
    model: yield model name handed to Y.set_model (e.g. 'Sor'). When None, the
           model currently set on the global Y is left untouched.

    Returns log prior + log likelihood, or -np.inf when theta lies outside
    theta_bounds or the log-likelihood is not finite.

    Raises ValueError when likelihood is neither 'Pois' nor 'Norm'.

    Uses the globals N_meas, tlive_PuBe, Ebins, g4, cap, Y and the module spec,
    which must already be defined.
    """

    #Access the global data
    #These must be already defined!!!
    global N_meas, tlive_PuBe, Ebins, g4, cap, Y

    #Reject an unknown likelihood up front, before any expensive work is done,
    #instead of handing None back to the sampler
    if likelihood not in ('Norm', 'Pois'):
        raise ValueError("Bad likelihood {0!r}: expected 'Pois' or 'Norm'".format(likelihood))

    #Calculate the (log)prior first since we may not need to calculate the likelihood
    lp=lp_flat(theta, theta_bounds)
    if not np.isfinite(lp):
        return -np.inf

    ############
    #Configure the yield model, then split theta into its pieces
    if model is not None:
        Y.set_model(model)
    #npars is read after set_model so that it belongs to the requested model
    #(presumably set_model updates npars - TODO confirm against R68_yield)
    nYpar=Y.npars

    Y.set_pars(theta[:nYpar])
    F_NR=theta[nYpar]
    scale_g4=theta[nYpar+1]
    scale_ng=theta[nYpar+2]

    ##########
    #Build the spectra
    #Avg spectra is slower, but more stable (the alternative is spec.buildSimSpectra_ee with a fixed seed)
    NR,ER,NG=spec.buildAvgSimSpectra_ee(Ebins=Ebins, Evec_nr=g4['NR']['E'], Evec_er=g4['ER']['E'], Evec_ng=cap['E'], dEvec_ng=cap['dE'],
                                        Yield=Y, F_NR=F_NR, scale_g4=scale_g4, scale_ng=scale_ng, doDetRes=True, fpeak=1)

    #Total counts for PuBe live time
    #Uncertainty will be sqrt(N)
    N_pred = (NR/g4['NR']['tlive'] + ER/g4['ER']['tlive'] + NG/cap['tlive'])*tlive_PuBe

    ##########
    #Calculate the log probability = log prior + log likelihood
    window=slice(*spec_bounds)
    if likelihood=='Norm':
        ll = ll_norm(N_meas[window],N_pred[window])
    else:
        #Only 'Pois' can reach this branch; other values were rejected above
        ll = ll_pois(N_meas[window],N_pred[window])

    if not np.isfinite(ll):
        return -np.inf

    return lp + ll

In [52]:
#Set eVee energy binning: 201 evenly spaced bin edges from 0 to Emax
Emax = 2000 #eVee
Ebins = np.linspace(0, Emax, 201)
#Record the binning alongside the other fit settings
mcmc_data.update(Emax=Emax, Ebins=Ebins)


#Live times of the two measured datasets
tlive_PuBe = meas['PuBe']['tlive']
tlive_Bkg = meas['Bkg']['tlive']

#Measured spectra
N_meas_PuBe,_ = np.histogram(meas['PuBe']['E'], bins=Ebins)
N_meas_Bkg,_ = np.histogram(meas['Bkg']['E'], bins=Ebins)

#We'll scale everything to the PuBe live time and work with counts, not rate, to get the Poisson stats right
N_meas_Bkg_scaled = N_meas_Bkg * tlive_PuBe/tlive_Bkg

#Estimate of counts due to PuBe: PuBe counts minus live-time-scaled background counts
N_meas = N_meas_PuBe - N_meas_Bkg_scaled

# Sorenson Fit

In [58]:
import emcee
from multiprocessing import Pool

#Sorenson fit
# theta = k, q, F_NR, scale_g4, scale_ng
# k: Lindhard k factor [unitless]
# q: Cutoff energy, in units of Lindhard epsilon (eps = 11.5*Er/1000*Z**(-7./3))

#Yield object for the 'Sor' model, created with parameter list [0.2, 2e-3]
Y = Yield.Yield('Sor', [0.2, 2e-3])

#Plot labels, one per entry of theta and in the same order
labels_s = [
    r'k',
    r'q',
    r'$F_{NR}$',
    r'$scale_{G4}$',
    r'$scale_{ng}$',
]

#Wrapper help
#def SorFit_helper(theta):
#    return calc_log_prob(model='Sor', theta=theta, theta_bounds=((0,1),(0,3e-2),(0,10),(0,10),(0,10)),
#                         spec_bounds=(5,101), likelihood='Pois')

#Test full range fit
def SorFit_helper(theta):
    """Log probability of theta for the 'Sor' model with a Poisson likelihood.

    The analysis window runs from 50 to 1750 eVee; both limits are converted
    to bin numbers of the global Ebins before calling calc_log_prob.
    """
    E_window = (50, 1.75e3) #eVee, (lower limit, upper limit)
    bin_lo, bin_hi = (np.digitize(E_edge,Ebins)-1 for E_edge in E_window)

    #Flat prior ranges, in the same order as theta
    prior_box = ((0,1),(0,3e-2),(0,10),(0,10),(0,10))

    return calc_log_prob(model='Sor', theta=theta, theta_bounds=prior_box,
                         spec_bounds=(bin_lo,bin_hi), likelihood='Pois')

In [None]:
#Number of walkers, number of fit parameters and number of steps per walker
nwalkers_s = 16
ndim_s = 5
nstep_s = 5000

#Sample priors uniformly
#Each row of tbs is the (lower, upper) range of one parameter; every walker
#starts at an independent uniform random point inside that box
tbs = np.array(((0,1),(0,3e-2),(0,10),(0,10),(0,10)))
guesses_s = tbs[:,0] + (tbs[:,1]-tbs[:,0])*np.random.random_sample((nwalkers_s, ndim_s))

with Pool(processes=20) as pool: #Can fail depending on current memory usage of other processes...
    sampler_s = emcee.EnsembleSampler(nwalkers_s, ndim_s, SorFit_helper, pool=pool)
    sampler_s.run_mcmc(guesses_s, nstep_s, progress=True);

 80%|███████▉  | 3994/5000 [4:31:12<1:12:21,  4.32s/it]

In [None]:
#Save this work
import pickle as pkl
import os

saveMCMC=True

if saveMCMC:
    #Make sure the output directory exists so a finished run is not lost to a missing folder
    os.makedirs('data', exist_ok=True)

    #Pick the first version number whose file does not exist yet,
    #so results of earlier runs are never overwritten
    fname_fmt='data/mcmc_Sor_{0}walk_{1}step_pois_v{2}.pkl'
    ifile = 0
    fname=fname_fmt.format(nwalkers_s,nstep_s,ifile+1)
    while os.path.exists(fname):
        ifile += 1
        fname=fname_fmt.format(nwalkers_s,nstep_s,ifile+1)

    print(fname)

    results={'sampler':sampler_s, 'guesses': guesses_s, 'labels':labels_s}

    #The with-block closes the file even if pickling raises
    with open(fname, 'wb') as saveFile:
        pkl.dump(results,saveFile)

In [None]:
#Look at the chain of parameter values: one panel per parameter, all walkers overlaid
samples_s = sampler_s.get_chain()
fig, axes = plt.subplots(ndim_s, figsize=(10, 7), sharex=True)
for i, ax in enumerate(axes):
    ax.plot(samples_s[:, :, i], "k", alpha=0.3)
    ax.set_xlim(0, len(samples_s))
    #ax.set_ylim(0, 5)
    ax.set_ylabel(labels_s[i])
    #Put every y label at the same horizontal offset
    ax.yaxis.set_label_coords(-0.1, 0.5)

axes[-1].set_xlabel("step number");

In [None]:
#Get the sample autocorrelation time of each parameter
tau_s = sampler_s.get_autocorr_time()
print(tau_s)
#Mean over the parameters, rounded to a whole number of steps
avgtau_s = round(np.mean(tau_s))
print(avgtau_s)

In [None]:
#Discard a few times tau as burn-in and thin by tau/2
#avgtau_s is set by hand here, replacing whatever value it held before this cell
avgtau_s = 200
flat_samples_s = sampler_s.get_chain(
    flat=True,
    discard=int(2.*avgtau_s),
    thin=int(round(avgtau_s/2.)),
)
print(flat_samples_s.shape)

In [None]:
import corner

#Corner plot of the flattened samples, with the 16%, 50% and 84% quantiles marked
fig = corner.corner(
    flat_samples_s,
    labels=labels_s,
    quantiles=[0.16, 0.5, 0.84],
    show_titles=True,
    title_fmt='0.3f',
);