In [15]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from thermoextrap import *
import thermoextrap


In [16]:
# xtrapy stuff:
# note, just using single module right now.  Can retinker
import thermoextrap.xtrapy.core as core
import xarray as xr

In [17]:
from importlib import reload

In [18]:
# Maximum derivative order used throughout the comparison below.
order = 6
# One old-API derivative function per order 0..order.
fs = list(map(thermoextrap.symDerivAvgX, range(order + 1)))

In [248]:
# Synthetic test data.
n = 10**5  # number of samples
nv = 5     # number of x values per sample

# Constant shifts added to the random energies / observables.
_u_offset = 0.0
_x_offset = 0.0

# Fixed seed for reproducibility.  The order of the draws below
# (u, x, ub, xb) determines which values each array receives.
np.random.seed(0)
u = _u_offset + np.random.rand(n)
x = _x_offset + np.random.rand(n, nv)

# Second set, used for testing.
ub = _u_offset + np.random.rand(n)
xb = _x_offset + np.random.rand(n, nv)

# comparison of xtrapy

## Data averages

In [249]:
# Test "coefs": reference coefficients from the old (list-based) API,
# one entry per order 0..order, evaluated on the averaging functions of (x, u).
ufunc, xufunc = thermoextrap.buildAvgFuncs(x, u, order)
coefs_list = [fs[i](ufunc, xufunc) for i in range(order+1)]

In [250]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [251]:
# coefficient class handler
# raw moments (no central=True argument here)

# coefficient builder
s = core.factory_coefs_beta()
# data object built from the samples u, x up to `order`; chunk=1000 is passed to factory_data
data = core.factory_data(u, x, order=order, chunk=1000)

In [252]:
# Difference between the moments stored on `data` and the old-API averaging functions.
# Note the very slight differences in the averages;
# these are due to floating-point numerics.
print(data.u - [ufunc(i) for i in range(order+1)])
print(data.xu - [xufunc(i) for i in range(order+1)])

<xarray.DataArray 'u' (moment: 5)>
array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.55111512e-17])
Dimensions without coordinates: moment
<xarray.DataArray 'x' (moment: 5, val: 5)>
array([[ 1.11022302e-15,  6.66133815e-16, -3.33066907e-16,
         6.77236045e-15, -4.77395901e-15],
       [-1.49880108e-15, -3.88578059e-16,  0.00000000e+00,
        -5.55111512e-17, -1.27675648e-15],
       [ 2.77555756e-17, -1.99840144e-15,  1.44328993e-15,
         1.05471187e-15,  2.49800181e-16],
       [ 4.71844785e-16,  9.43689571e-16,  1.44328993e-15,
         1.80411242e-16, -7.21644966e-16],
       [ 1.66533454e-15, -1.04083409e-15,  1.33226763e-15,
        -7.63278329e-16,  1.98452366e-15]])
Dimensions without coordinates: moment, val


## coefficients of expansion

In [253]:
# norm=True includes the 1/n! factor;
# if you want the straight coefficients, use norm=False
c = s.xcoefs(data, norm=False) 
c

In [254]:
# test that the xtrapy coefficients equal the old-API ones (relative tolerance 1e-5)
np.testing.assert_allclose(c, coefs_list, rtol=1e-5)

In [255]:
# instead, we can use central moments: same factory, with central=True
s_c = core.factory_coefs_beta(central=True)

In [256]:
# data object using central moments, built from the same samples as `data`
data_c = core.factory_data(u, x, order=order, central=True)

# unnormalized coefficients from the central-moment path
c_c = s_c.xcoefs(data_c, norm=False)
c_c

In [257]:
# test that the raw-moment and central-moment coefficients agree (default norm)
np.testing.assert_allclose(s.xcoefs(data), s_c.xcoefs(data_c))

# nice-ness of central moments

In [258]:
# coeffs using raw moments u[i], xu[i]
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'Coefs' object has no attribute 'exprs'" -- the attribute
# name needs to be updated to whatever the current core API exposes.
for i in range(order+1):
    print('{}: {}'.format(i, s.exprs[i]))

AttributeError: 'Coefs' object has no attribute 'exprs'

In [259]:
# coeffs using central moments du[i] = <(u-<u>)**i>
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'Coefs' object has no attribute 'exprs'" -- the attribute
# name needs to be updated to whatever the current core API exposes.

for i in range(order+1):
    print('{}: {}'.format(i, s_c.exprs[i]))

AttributeError: 'Coefs' object has no attribute 'exprs'

In [73]:
# so not only are central moments more stable, the resulting expressions
# are way simpler!

# taking advantage of xarray

In [74]:
# Build a dataset holding both observable sets as variables 'a' and 'b'.
x_set = xr.merge(
    [core.xrwrap_xv(values, name=label)
     for label, values in (('a', x), ('b', xb))]
)

# Data objects (raw and central moments) for the combined dataset.
data_set = core.factory_data(u, x_set, order)
data_set_c = core.factory_data(u, x_set, order, central=True)

# Stand-alone data objects for the "other" x, needed for comparison.
datab = core.factory_data(u, xb, order)
datab_c = core.factory_data(u, xb, order, central=True)

In [75]:
# note that data_set has multiple measurements in it.  Very nice...
# xv is indexed by (rec, val) and xu by (moment, val); both carry variables 'a' and 'b'
print(data_set.xv)
print(data_set.xu)

<xarray.Dataset>
Dimensions:  (rec: 100000, val: 5)
Dimensions without coordinates: rec, val
Data variables:
    a        (rec, val) float64 0.5353 0.904 0.5024 ... 0.7553 0.2138 0.7261
    b        (rec, val) float64 0.6237 0.1782 0.2007 ... 0.08856 0.0782 0.4235
<xarray.Dataset>
Dimensions:  (moment: 7, val: 5)
Dimensions without coordinates: moment, val
Data variables:
    a        (moment, val) float64 0.5004 0.499 0.5004 ... 0.07138 0.07166
    b        (moment, val) float64 0.5001 0.5015 0.5001 ... 0.07127 0.07149


In [76]:
# old-API reference coefficients for the second observable set xb (same u)
ufuncb, xufuncb = thermoextrap.buildAvgFuncs(xb, u, order)
coefs_listb = [fs[i](ufuncb, xufuncb) for i in range(order+1)]

In [77]:
# make sure xtrapy gives the same values as the old API for xb
np.testing.assert_allclose(s.xcoefs(datab,  norm=False), coefs_listb)

In [78]:
# consider the dataset: coefficients for all variables in one call
c_set = s.xcoefs(data_set, norm=False)
c_set

In [79]:
# testing: each dataset variable must match the corresponding single-array result
cb = s.xcoefs(datab, norm=False)
np.testing.assert_allclose(c_set.a, c)
np.testing.assert_allclose(c_set.b, cb)

In [80]:
# central-moment coefficients for the dataset
c_set_c = s_c.xcoefs(data_set_c, norm=False)
# test that they match the raw-moment dataset coefficients
xr.testing.assert_allclose(c_set, c_set_c)

# resampling

In [81]:
# resampling: draw 10 resampled replicates and display the resulting u
data.resample(10).u

In [82]:
# testing resampling:
# build one index array (nrep=10) and reuse it for every data object below
idx = core.resample_indicies(len(data), nrep=10)

In [83]:
# Coefficients from each data object, all resampled with the same
# pre-built index array (no nrep given) so the results are comparable.
c_r = s.xcoefs(data.resample(None, idx))
cb_r = s.xcoefs(datab.resample(None, idx))
cs_r = s.xcoefs(data_set.resample(None, idx))

In [84]:
# resampled dataset variables must match the resampled single-array results
np.testing.assert_allclose(cs_r.a, c_r)
np.testing.assert_allclose(cs_r.b, cb_r)

In [85]:
# central moments, resampled with the same index array
cs_c_r = s_c.xcoefs(data_set_c.resample(None, idx))

In [86]:
# raw-moment and central-moment resampled coefficients must agree
xr.testing.assert_allclose(cs_r, cs_c_r)

# resampling time and chunking

In [87]:
# xarray can use dask for chunked computation.  I'm no expert, but this should speed things up.
# use chunking to create dask arrays of uv, xv

In [90]:
# index array with 100 replicates, shared by the two timing cells that follow
idx = core.resample_indicies(len(data), nrep=100)

In [95]:
%%timeit -n 1 -r 1
# resampling without chunking (chunk=None)
# r.u / r.xu are accessed so the averages are included in the timing
# (presumably they are computed on access -- TODO confirm)
r = data.resample(None, idx, chunk=None)
r.u
r.xu

2.05 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [96]:
%%timeit -n 1 -r 1
# same resampling, now with chunk=10000
r = data.resample(None, idx, chunk=10000)
r.u
r.xu

1.65 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Extrap model

In [118]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [119]:
# test Extrap model
# betas: values to predict at; ref_beta: reference value the models are built at
betas = [0.3, 0.4]
ref_beta = 0.5

In [120]:
# blank old-API model (no data passed yet; trained in the next cell)
em = thermoextrap.ExtrapModel(maxOrder=order)

In [121]:
# train the old-API model at ref_beta on (x, u), saving the parameters on the model
params = em.train(ref_beta, xData=x, uData=u, saveParams=True)

In [122]:
# old-API prediction at the target betas with order=3
em.predict(betas, order=3)

array([[0.50043757, 0.49897351, 0.50042238, 0.49912602, 0.50117469],
       [0.50042413, 0.49898436, 0.50040218, 0.49912628, 0.50116313]])

In [123]:
# xtrapy model from the same values, using raw moments (central=False)
xem = core.ExtrapModel.from_values_beta(order, ref_beta, u, x, xalpha=False, central=False)
xem.predict(betas, order=3)

In [124]:
# central moments, also built with the "from_values_beta" constructor
xem_c = core.ExtrapModel.from_values_beta(order, ref_beta, u, x, xalpha=False, central=True)
xem_c.predict(betas, order=3)

In [125]:
# resample: old-API bootstrap at the target betas with n=20
em.bootstrap(betas, n=20)

array([[0.00087799, 0.00078615, 0.00099371, 0.00063138, 0.0008918 ],
       [0.00087689, 0.00079002, 0.00098945, 0.00063244, 0.00088957]])

In [126]:
# resample data (20 replicates), then take the standard deviation over the 'rep' dimension
xem_r = xem.resample(20)
xem_r.predict(betas).std('rep')

In [127]:
# same resampling estimate for the central-moment model
xem_c.resample(20).predict(betas).std('rep')

In [128]:
# note that xem_r is just an Extrap model with new, resampled data
xem_r.data.u

In [129]:
# testing dataset: old-API model for the second observable set xb
emb = thermoextrap.ExtrapModel(order, ref_beta, xb, u)

In [130]:
# xtrapy central-moment models: one for xb alone, one for the combined dataset x_set
xemb = core.ExtrapModel.from_values_beta(order, ref_beta, u, xb, central=True)
xem_set = core.ExtrapModel.from_values_beta(order, ref_beta, u, x_set, central=True)

In [134]:
# The old-API model and the xtrapy model built from the same (u, xb) data must agree.
# (The previous version compared emb.predict(betas) with itself, which can never fail.)
np.testing.assert_allclose(emb.predict(betas), xemb.predict(betas))

In [133]:
# testing dataset: predictions from the single-array models ...
out = xem.predict(betas)
outb = xemb.predict(betas)
# ... and from the combined-dataset model
outs = xem_set.predict(betas)

# each dataset variable must match its single-array counterpart
np.testing.assert_allclose(out, outs.a)
np.testing.assert_allclose(outb, outs.b)

In [136]:
# resampling: one shared index array (20 replicates) for all three models
idx = core.resample_indicies(len(x), 20)

out = xem.resample(None, idx).predict(betas)
outb = xemb.resample(None, idx).predict(betas)
outs = xem_set.resample(None, idx).predict(betas)

In [137]:
# resampled dataset predictions must match the resampled single-array predictions
np.testing.assert_allclose(out, outs.a)
np.testing.assert_allclose(outb, outs.b)

In [138]:
# resampling much faster with chunking

In [141]:
%%timeit -n 1 -r 1
# timing of the old-API bootstrap (default number of resamples)
print(em.bootstrap(betas))

[[0.0008664  0.00083273 0.00094727 0.00096115 0.00092076]
 [0.00086488 0.00083089 0.00094839 0.00095933 0.00092211]]
4.69 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [146]:
%%timeit -n 1 -r 1
# timing of the xtrapy equivalent: 100 replicates with chunk=1000
print(xem.resample(100, chunk=1000).predict(betas).std('rep'))

<xarray.DataArray (alpha: 2, val: 5)>
array([[0.00089334, 0.00082602, 0.00086824, 0.00082739, 0.00092418],
       [0.00089403, 0.0008265 , 0.00086752, 0.00082269, 0.00092349]])
Coordinates:
  * alpha    (alpha) float64 0.3 0.4
    dalpha   (alpha) float64 -0.2 -0.1
    alpha0   float64 0.5
Dimensions without coordinates: val
1.79 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# ExtrapWeighted

In [147]:
# New data for the weighted model: the leading axis of size 2 indexes the two
# reference states (each state is later paired with one entry of beta0).
# NOTE: this rebinds u, x, xb and order, which the earlier sections also use.
u = np.random.rand(2, n) + 5
x = np.random.rand(2, n, nv) + 10
# use nv (not a literal 5) so xb always has the same number of values as x
xb = np.random.rand(2, n, nv) + 2
order = 4

In [148]:
# beta0: one reference beta per state; betas: values to predict at
beta0 = [0.05, 0.5]
betas = [0.1, 0.2, 0.3, 0.4]

In [149]:
# old-API weighted models, one for x and one for xb
emw = thermoextrap.ExtrapWeightedModel(order, beta0, x, u)
emwB = thermoextrap.ExtrapWeightedModel(order, beta0, xb, u)

In [153]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [154]:
# for xtrapy, create a list of models, one for each state:

# first create datasets for xs (variables 'a' from x and 'b' from xb, per state)
xs = [xr.Dataset({'a': core.xrwrap_xv(xx), 
                  'b': core.xrwrap_xv(xxb)})
      for xx, xxb in zip(x, xb)
     ]

# now create the list of models, one for each state
states = [core.ExtrapModel.from_values_beta(order, b0, uu, xx, central=True)
         for b0, uu, xx in zip(beta0, u, xs)]

# weighted model built from the per-state models
xemw = core.ExtrapWeightedModel(states)

In [158]:
# same construction using only the plain array x for each state (no 'b' variable)
states_a = [core.ExtrapModel.from_values_beta(order, b0, uu, xx, central=True)
             for b0, uu, xx in zip(beta0, u, x)]
xemw_a = core.ExtrapWeightedModel(states_a)

In [157]:
# old-API weighted predictions must match the corresponding xtrapy dataset variables
np.testing.assert_allclose(emw.predict(betas), xemw.predict(betas).a)
np.testing.assert_allclose(emwB.predict(betas), xemw.predict(betas).b)

In [None]:
# One independent index array (20 replicates) per state, shared by both
# weighted models so their resampled predictions use the same draws.
idxs = [core.resample_indicies(len(xemw[0].data), 20) for _ in range(2)]
a = xemw.resample(None, idxs).predict(betas).a
b = xemw_a.resample(None, idxs).predict(betas)

In [159]:
%%timeit -n 1 -r 1
# timing of the old-API weighted bootstrap with n=100
emw.bootstrap(betas, n=100)

5.77 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [160]:
%%timeit -n 1 -r 1
# note that this is actually doing twice the work,
# because the data in xemw includes both x and xb
xemw.resample(100).predict(betas).std('rep')

7.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [161]:
%%timeit -n 1 -r 1
# note that this is actually doing twice the work,
# because the data in xemw includes both x and xb; here with chunk=1000
xemw.resample(100, chunk=1000).predict(betas).std('rep')

5.75 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [162]:
%%timeit -n 1 -r 1
# xemw_a is built from states_a, which hold only x (not xb),
# so unlike the xemw timings this is not doing double the work
xemw_a.resample(100, chunk=10000).predict(betas).std('rep')

3.69 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [163]:
from dask.diagnostics import ProgressBar

In [164]:
%%timeit -n 1 -r 1
# xemw_a is built from states_a, which hold only x (not xb);
# 500 replicates, with a dask progress bar shown for each computation
with ProgressBar():
    xemw_a.resample(500, chunk=10000).predict(betas).std('rep')

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.9s
[########################################] | 100% Completed |  4.7s
[########################################] | 100% Completed |  0.4s
[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  2.5s
21.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# InterpModel

In [165]:
# old-API interpolation models, one for x and one for xb
emi = thermoextrap.InterpModel(order, beta0, x, u)
emib = thermoextrap.InterpModel(order, beta0, xb, u)

In [166]:
# xtrapy interpolation model from the per-state dataset models (variables 'a' and 'b')
xemi = core.InterpModel(states)

In [167]:
# per-state models using only xb
states_b = [core.ExtrapModel.from_values_beta(order, b0, uu, xx, central=True)
             for b0, uu, xx in zip(beta0, u, xb)]

# single-array interpolation models for x (states_a) and xb (states_b)
xemi_a = core.InterpModel(states_a)
xemi_b = core.InterpModel(states_b)


In [170]:
# NOTE(review): alpha is not referenced by any of the cells visible below -- confirm it is still needed
alpha = [0.1, 0.2]

In [175]:
# old-API interpolation predictions must match the corresponding xtrapy dataset variables
np.testing.assert_allclose(emi.predict(betas), xemi.predict(betas).a)
np.testing.assert_allclose(emib.predict(betas), xemi.predict(betas).b)

In [176]:
# old-API bootstrap for the interpolation model with n=20
emi.bootstrap(betas, n=20)

array([[0.00087349, 0.00068129, 0.00118118, 0.00085011, 0.00070927],
       [0.00077745, 0.00061355, 0.0010338 , 0.00069158, 0.00059451],
       [0.00079354, 0.00070486, 0.00069287, 0.00068301, 0.00064373],
       [0.00107194, 0.00096932, 0.00073749, 0.00109109, 0.00099041]])

In [177]:
# xtrapy equivalent: 20 replicates, standard deviation over 'rep', variable 'a' only
xemi.resample(20).predict(betas).std('rep').a

In [180]:
%%timeit -n 1 -r 1
# timing of the old-API interpolation bootstrap with n=100
emi.bootstrap(betas, n=100)

6.01 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [186]:
%%timeit -n 1 -r 1
# timing of the xtrapy equivalent on x only: 100 replicates, chunk=10000
xemi_a.resample(100, chunk=10000).predict(betas).std('rep')

3.15 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# MBAR

In [187]:
# old-API MBAR model built from the reference betas and the per-state x, u data
mbar = thermoextrap.MBARModel(refB=beta0, xData=x, uData=u)

In [188]:
# old-API MBAR prediction at the target betas
mbar.predict(betas)

array([[10.49972623, 10.50010647, 10.49990363, 10.49914726, 10.49892917],
       [10.49972453, 10.50010473, 10.49991327, 10.49914194, 10.49891843],
       [10.49972216, 10.5001031 , 10.49992286, 10.49913709, 10.49890786],
       [10.49971909, 10.50010157, 10.4999324 , 10.49913273, 10.49889744]])

In [189]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [190]:
# xtrapy MBAR model from the per-state models built on x only
xmbar = core.MBARModel(states_a)

In [191]:
# xtrapy MBAR prediction at the target betas (rebinds `out` from the earlier section)
out = xmbar.predict(betas)

In [192]:
# display the xtrapy MBAR prediction
out

In [193]:
# old-API and xtrapy MBAR predictions must agree
np.testing.assert_allclose(mbar.predict(betas), xmbar.predict(betas))

In [194]:
import os

In [195]:
# process id of the running kernel
# NOTE(review): looks like a debugging leftover; nothing visible below uses it
os.getpid()

63948

# Perturb

In [196]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [197]:
# reference beta passed to both perturbation models below
beta_ref = 0.5

In [198]:
# old-API perturbation model on the first state's data (x[0], u[0])
pm = thermoextrap.PerturbModel(beta_ref, x[0], u[0])

In [199]:
# old-API perturbation prediction, with the MBAR path switched off (useMBAR=False)
pm.predict([0.1, 0.2], useMBAR=False)

array([[10.50068176, 10.49987931, 10.50062532, 10.50060796, 10.49952804],
       [10.50066577, 10.49985426, 10.50063663, 10.50056739, 10.49951137]])

In [200]:
# xtrapy perturbation model on the same data; note the argument order here is (beta, u, x)
xpm = core.PerturbModel.from_values(beta_ref, u[0], x[0])

In [201]:
# xtrapy perturbation prediction at the same betas as the old-API call
xpm.predict([0.1, 0.2])

# log func

In [202]:
from thermoextrap.utilities import buildAvgFuncs

#For quantities like the chemical potential, we're interested in the -log(<X>), not <X>
#Everything is the same, but we take derivatives differently
#Luckily, have closed-form expression for derivatives of -log(<X>) in terms of derivatives of <X>
#Specifically, d(n)[-log(<X>)]/dB(n) = Sum(k=1, n)[(k-1)! * (-1/<X>)^k * B(n,k,(d<X>/dB, ..., d(n-k+1)<X>/dB(n-k+1)))]
#B(n,k, (...)) represents Bell Polynomials, which are implemented in sympy (not numpy or scipy unfortunately)
#Create custom classes to handle this
from sympy import bell

class LogAvgExtrapModel(ExtrapModel):
    """Extrapolation model for -log(<X>) instead of <X>.

    Only the derivative evaluation differs from the parent ``ExtrapModel``.
    The n-th derivative of -log(<X>) is assembled from the derivatives of
    <X> using

        d^n[-log<X>]/dB^n
            = sum_{k=1..n} (k-1)! * (-1/<X>)^k * B_{n,k}(<X>', ..., <X>^(n-k+1))

    where B_{n,k} is the incomplete Bell polynomial (``sympy.bell``).
    """

    def calcDerivVals(self, refB, x, U):
        """Return the derivatives of -log(<x>) for orders 0..self.maxOrder.

        Parameters
        ----------
        refB : not used by this method; kept for the parent-class signature.
        x : array whose first axis is matched against ``U`` and whose second
            axis (``x.shape[1]``) indexes the observable components.
        U : array whose first axis must have the same length as ``x``'s.

        Returns
        -------
        numpy.ndarray of shape ``(self.maxOrder + 1, x.shape[1]))`` with row
        ``o`` holding the o-th derivative, or ``None`` (after printing a
        message) when the leading dimensions of ``x`` and ``U`` differ.
        """
        # np.math was only an alias of the stdlib math module and is gone in
        # NumPy 2.0, so take factorial from the standard library directly.
        from math import factorial

        if x.shape[0] != U.shape[0]:
            print('First observable dimension (%i) and size of potential energy array (%i) do not match!'%(x.shape[0], U.shape[0]))
            return

        avgUfunc, avgXUfunc = buildAvgFuncs(x, U, self.maxOrder)
        derivVals = np.zeros((self.maxOrder+1, x.shape[1]))

        # Loop-invariant quantities: the plain average <x> and every
        # derivative of <x> that any (order, k) pair below can ask for.
        avgX = avgXUfunc(0)
        allDiffs = np.array([self.derivF[l](avgUfunc, avgXUfunc)
                             for l in range(1, self.maxOrder+1)])

        # Zeroth order is just -log(<x>).
        derivVals[0] = -np.log(avgX)

        for o in range(1, self.maxOrder+1):
            for k in range(1, o+1):
                # B_{o,k} needs the first (o-k+1) derivatives of <x>
                nDiffs = o - k + 1
                # Apply the chain rule to each element of the observable array
                for j in range(x.shape[1]):
                    derivVals[o,j] += factorial(k-1)*((-1/avgX[j])**k)*bell(o, k, allDiffs[:nDiffs,j])

        return derivVals


In [203]:
# display the target betas currently in scope
betas

[0.1, 0.2, 0.3, 0.4]

In [204]:
# reuse the single-state samples stored on xem (the model built in the Extrap section)
xdata = xem.data.xv.values
udata = xem.data.uv.values
# reference beta for the log-average models below
refBeta=0.5

In [205]:
# Create and train the extrapolation model
extModelLog = LogAvgExtrapModel(maxOrder=4, refB=refBeta, 
                                xData=xdata,
                                uData=udata,
                                )

# Note that we handled the -log calculation in the definition of the derivatives (even at zeroth order).
# This means we want to just pass data, not the -log of the data.

# Check the parameters
print("Model parameters (derivatives):")
print(extModelLog.params)
print('\n')

# Finally, look at predictions
print("Model predictions:")
print(extModelLog.predict(betas, order=2))
print('\n')

Model parameters (derivatives):
[[ 6.92326220e-01  6.95158101e-01  6.92383757e-01  6.94896673e-01
   6.90846049e-01]
 [ 2.69773834e-04 -2.27673014e-04  4.07282201e-04  1.07002690e-05
   2.20961626e-04]
 [ 4.48585482e-06 -6.47472658e-05  2.02998635e-05  1.12768749e-04
  -6.90717840e-05]
 [-5.07388591e-05  5.69296180e-05 -5.54391284e-05  9.44529753e-05
  -6.20967030e-05]
 [ 1.05400889e-05  2.10718898e-05 -1.32468610e-05 -4.42767814e-05
   1.91285621e-05]]


Model predictions:
[[0.69221867 0.69524399 0.69222247 0.69490141 0.69075214]
 [0.69224549 0.69522349 0.69226249 0.69489854 0.69077665]
 [0.69227236 0.69520234 0.69230271 0.69489679 0.69080048]
 [0.69229927 0.69518054 0.69234313 0.69489617 0.69082361]]




In [230]:
# re-execute the core module so edits made to core.py during this session are picked up
reload(core)

<module 'thermoextrap.xtrapy.core' from '/Users/wpk/Documents/python/projects/xtrapy/thermoextrap/xtrapy/core.py'>

In [231]:
# xtrapy counterpart of extModelLog: same order (4), data and reference beta, with minus_log=True
xem_log = core.ExtrapModel.from_values_beta(4, refBeta, udata, xdata, 
                                            xalpha=False, central=False, minus_log=True)

In [240]:
# difference between the xtrapy coefficients and the hand-written model's parameters
xem_log.xcoefs(norm=False) - extModelLog.params

In [241]:
# xtrapy -log prediction with order=2, for comparison with extModelLog.predict
xem_log.predict(betas, order=2)

In [243]:
# xtrapy resampling estimate: 100 replicates, standard deviation over 'rep'
xem_log.resample(100).predict(betas,order=2).std('rep')

In [242]:
# And the bootstrapped uncertainties from the hand-written model
print("Bootstrapped uncertainties in predictions:")
print(extModelLog.bootstrap(betas, order=2))

Bootstrapped uncertainties in predictions:
[[0.0019606  0.00188727 0.00195556 0.00175709 0.00168601]
 [0.00195445 0.00189193 0.00195472 0.00175012 0.0016689 ]
 [0.00194944 0.0018977  0.00195485 0.0017444  0.00165321]
 [0.00194558 0.00190457 0.00195593 0.00173991 0.00163897]]


In [598]:
# Sanity check of the incomplete Bell polynomial B_{4,2} used by LogAvgExtrapModel.
# `sp` and `X` were not defined anywhere in this notebook, so define them here.
# NOTE(review): an IndexedBase reproduces the X[i] form of the recorded output -- confirm.
import sympy as sp

X = sp.IndexedBase('X')
sp.bell(4, 2, X)

4*X[0]*X[2] + 3*X[1]**2