In [None]:
%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import numpy as np
import math
from sampling import NestedSampling, Uniform, Callback, Normal, SamplingException

from filterpy.kalman import UnscentedKalmanFilter as UKF
from filterpy.kalman import KalmanFilter as KF
from filterpy.kalman import MerweScaledSigmaPoints
from filterpy.common import Q_continuous_white_noise

import progressbar

from clemb.forward_model import forward_model, fullness, esol
import clemb

## Table of contents
---------------------------
1. [Data preparation](#Data-preparation)
2. [Inversion](#Inversion)

## Data preparation

In [None]:
def common_date(date):
    """
    Group-by key for pandas: map a timestamp onto midnight of its
    calendar day.

    All measurements taken on the same day therefore share one key,
    which lets the callers average them with ``groupby(...).mean()``.
    Only year, month and day are carried over into the result.
    """
    return pd.Timestamp(year=date.year, month=date.month, day=date.day)

In [None]:
def interpolate_mg(df, dt=1):
    """
    Inter- and extrapolate Mg++ measurements using a
    non-linear (unscented) Kalman filter followed by an RTS smoother.

    Parameters
    ----------
    df : pandas.DataFrame
        Regularly sampled frame with a column ``'obs'``. Missing samples
        are NaN; the first row must hold a valid observation because it
        initialises the filter state.
    dt : float
        Time step between consecutive rows, passed to the filter and to
        the process-noise model. The caller in this notebook passes a
        daily-resampled frame and keeps the default of 1.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns ``'obs'`` (smoothed values),
        ``'obs_err'`` (standard deviation from the smoothed covariance)
        and ``'orig'`` (the untouched input observations).
    """
    # Standard deviation assigned to a single observation. It defines the
    # measurement noise R and the uncertainty reported for the first
    # sample, which is taken as-is and never smoothed.
    obs_std = 50.

    def f_x(x, dt):
        """
        Forward model exponential decay dy/dt = -k*y.

        State layout: x[0] is the concentration, x[1] is the decay
        constant multiplied by 1e3 (it is divided by 1e3 before use).
        The decay constant itself is propagated unchanged.
        """
        _k = x[1]/1e3
        _dt = dt
        _y = x[0]
        if isinstance(dt, np.ndarray):
            _dt = dt[0]
        # 4th order Runge-Kutta
        k0 = -_k * _y * _dt
        k1 = -_k * (_y + 0.5 * k0) * _dt
        k2 = -_k * (_y + 0.5 * k1) * _dt
        k3 = -_k * (_y + k2) * _dt
        _y_next = _y + 1./6.*(k0 + 2 * k1 + 2 * k2 + k3)
        return np.array([_y_next, x[1]])

    def h_x(x):
        """
        Measurement function: only the concentration is observed.
        """
        return [x[0]]

    ny = df['obs'].values
    # The filter is handed None (not NaN) for days without a measurement.
    ny = np.where(np.isnan(ny), None, ny)

    points = MerweScaledSigmaPoints(n=2, alpha=.01, beta=2., kappa=1.)
    kf = UKF(dim_x=2, dim_z=1, dt=dt, fx=f_x, hx=h_x, points=points)
    kf.x = np.array([ny[0], .6])
    kf.Q = Q_continuous_white_noise(2, dt=dt, spectral_density=1e-5)
    kf.P = np.diag([100.**2, 3.**2])
    kf.R = obs_std**2

    # The first sample seeds the state, so only the remaining samples
    # are filtered.
    nfiltered = ny.size - 1
    means = np.zeros((nfiltered, 2))
    covariances = np.zeros((nfiltered, 2, 2))
    for i, z_n in enumerate(ny[1:]):
        kf.predict()
        kf.update(z_n)
        means[i, :] = kf.x
        covariances[i, :, :] = kf.P
    Ms, P, K = kf.rts_smoother(means, covariances)
    y_new = np.r_[ny[0], Ms[:, 0]]
    y_std = np.r_[obs_std, np.sqrt(P[:, 0, 0])]
    return pd.DataFrame({'obs': y_new,
                         'obs_err': y_std,
                         'orig': df['obs'].values},
                        index=df.index)

In [None]:
def get_mg_data(tstart=None, tend=None):
    """
    Download the Mg++ concentration series and return it as a gap-free,
    smoothed daily series.

    Parameters
    ----------
    tstart : str or timestamp-like, optional
        Earliest date to keep. Defaults to (and is clipped to) the first
        available observation.
    tend : str or timestamp-like, optional
        Latest date to keep. Defaults to the current UTC date, resolved
        when the function is *called*. (A default computed in the
        signature would be frozen at definition time and go stale in a
        long-running kernel.)

    Returns
    -------
    pandas.DataFrame
        Output of ``interpolate_mg`` on the daily series, which starts at
        the first day with a valid observation.
    """
    if tend is None:
        tend = pd.Timestamp.now(tz='UTC').strftime("%Y-%m-%d")

    # Get Mg++ concentration
    url = "https://fits.geonet.org.nz/observation?siteID=RU003&typeID=Mg-w"
    names = ['obs', 'obs_err']
    mg_df = pd.read_csv(url, index_col=0, names=names, skiprows=1,
                        parse_dates=True)
    if tstart is not None:
        tstart = max(mg_df.index.min(), pd.Timestamp(tstart))
    else:
        tstart = mg_df.index.min()
    in_window = (mg_df.index >= tstart) & (mg_df.index <= tend)
    mg_df = mg_df.loc[in_window]

    # Average several measurements of the same day, then put the series
    # on a regular daily grid (days without data become NaN).
    mg_df = mg_df.groupby(common_date).mean()
    new_dates = pd.date_range(start=tstart, end=tend, freq='D')
    mg_df = mg_df.reindex(index=new_dates)
    # Find the first non-NaN entry
    tstart_min = mg_df.loc[~mg_df['obs'].isnull()].index[0]
    # Ensure the time series starts with a non-NaN value
    mg_df = mg_df.loc[mg_df.index >= tstart_min]
    return interpolate_mg(mg_df)

# First date requested for the Mg++ series. The resulting frame `df1`
# defines the date range that the temperature and lake-level requests
# further down are asked to cover (they use df1.index[0] / df1.index[-1]).
tstart = '2009-09-25'
df1 = get_mg_data(tstart=tstart)
# Show the first rows as a quick sanity check.
df1.head()


In [None]:
# Smoothed Mg++ series with its 3-sigma band and the raw observations.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df1.index, df1['obs'], 'k--', label='smoothed')
ax.fill_between(df1.index, df1['obs']-3*df1['obs_err'],
                df1['obs']+3*df1['obs_err'], alpha=0.5,
                label=r'$\pm 3\sigma$')
ax.plot(df1.index, df1['orig'], 'k+', label='observed')
ax.set_xlabel('Date')
ax.set_ylabel('Mg++ concentration')
ax.set_title('Mg++ concentration: observations and Kalman-smoothed series')
ax.legend();


In [None]:
def interpolate_T(df, dt=1):
    """
    Inter- and extrapolate temperature measurements with a linear
    Kalman filter followed by an RTS smoother.

    The state is ``[temperature, temperature change per unit time]`` and
    is propagated with a constant-rate model; only the temperature is
    observed.

    Parameters
    ----------
    df : pandas.DataFrame
        Regularly sampled frame with a column ``'t'``. Missing samples
        are NaN; the first row must hold a valid observation.
    dt : float
        Time step between consecutive rows. It is used for both the
        transition matrix and the process noise, so the two stay
        consistent for any ``dt``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns ``'t'`` (smoothed values),
        ``'t_err'`` (standard deviation from the smoothed covariance)
        and ``'t_orig'`` (the untouched input observations).
    """
    ny = df['t'].values
    # The filter is handed None (not NaN) for days without a measurement.
    ny = np.where(np.isnan(ny), None, ny)

    kf = KF(dim_x=2, dim_z=1)
    # Constant-rate transition: T_next = T + rate*dt, rate unchanged.
    kf.F = np.array([[1., dt], [0., 1.]])
    kf.H = np.array([[1., 0]])
    # Initial rate: difference of the first two samples if the second
    # one exists, otherwise start from "no change".
    if ny[1] is not None:
        dT0 = ny[1] - ny[0]
    else:
        dT0 = 0.
    kf.x = np.array([ny[0], dT0])
    kf.Q = Q_continuous_white_noise(2, dt=dt, spectral_density=3e-2)
    kf.P *= 1e-5**2
    kf.R = .5**2
    # The first sample seeds the state; the remaining ones are filtered
    # in one batch and then smoothed backwards.
    means, covariances, _, _ = kf.batch_filter(ny[1:])
    Ms, P, _, _ = kf.rts_smoother(means, covariances)
    y_new = np.r_[ny[0], Ms[:, 0]]
    # The first sample is not smoothed; it is assigned a fixed standard
    # deviation of 0.3.
    y_std = np.r_[.3, np.sqrt(P[:, 0, 0])]
    return pd.DataFrame({'t': y_new,
                         't_err': y_std,
                         't_orig': df['t'].values},
                        index=df.index)

In [None]:
def get_T(tstart=None, tend=None):
    """
    Download the lake temperature series and return it as a gap-free,
    smoothed daily series.

    Parameters
    ----------
    tstart : str or timestamp-like, optional
        Earliest date to keep. Defaults to (and is clipped to) the first
        available observation.
    tend : str or timestamp-like, optional
        Latest date to keep. Defaults to the current UTC date, resolved
        when the function is *called*. (A default computed in the
        signature would be frozen at definition time and go stale in a
        long-running kernel.)

    Returns
    -------
    pandas.DataFrame
        Output of ``interpolate_T`` on the daily series, which starts at
        the first day with a valid observation.
    """
    if tend is None:
        tend = pd.Timestamp.now(tz='UTC').strftime("%Y-%m-%d")

    # Get temperature
    # Temperature has been recorded by 3 different sensors so 3 individual
    # requests have to be made
    url = "https://fits.geonet.org.nz/observation?siteID=RU001&typeID=t&methodID={}"
    names = ['t', 't_err']

    def _read(method):
        # One request per recording method.
        return pd.read_csv(url.format(method),
                           index_col=0, names=names, skiprows=1,
                           parse_dates=True)

    tdf_therm = _read('therm')
    tdf_thermcoup = _read('thermcoup')
    tdf_logic = _read('logic')
    # Merge the three records; where several have a value for the same
    # timestamp the priority is 'logic', then 'thermcoup', then 'therm'.
    t_df = tdf_logic.combine_first(tdf_thermcoup).combine_first(tdf_therm)

    if tstart is not None:
        tstart = max(t_df.index.min(), pd.Timestamp(tstart))
    else:
        tstart = t_df.index.min()
    in_window = (t_df.index >= tstart) & (t_df.index <= tend)
    t_df = t_df.loc[in_window]

    # Average several measurements of the same day, then put the series
    # on a regular daily grid (days without data become NaN).
    t_df = t_df.groupby(common_date).mean()
    new_dates = pd.date_range(start=tstart, end=tend, freq='D')
    t_df = t_df.reindex(index=new_dates)
    # Find the first non-NaN entry
    tstart_min = t_df.loc[~t_df['t'].isnull()].index[0]
    # Ensure the time series starts with a non-NaN value
    t_df = t_df.loc[t_df.index >= tstart_min]
    return interpolate_T(t_df)

# Request temperature for the date range covered by the Mg++ series so
# that both frames can be combined later on.
# NOTE(review): get_T trims leading days without data, so df2 can start
# later than df1 if no temperature exists on df1.index[0] -- confirm the
# indices match before combining.
df2 = get_T(tstart=df1.index[0], tend=df1.index[-1])
# Alternative kept by the author: request the full available record.
#df2 = get_T()
df2.head()

In [None]:
# Smoothed temperature series with its 3-sigma band and the raw observations.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df2.index, df2['t'], 'k--', label='smoothed')
ax.fill_between(df2.index, df2['t']-3*df2['t_err'],
                df2['t']+3*df2['t_err'], alpha=0.5,
                label=r'$\pm 3\sigma$')
ax.plot(df2.index, df2['t_orig'], 'k+', label='observed')
ax.set_xlabel('Date')
ax.set_ylabel('Lake temperature')
ax.set_title('Lake temperature: observations and Kalman-smoothed series')
ax.legend();


In [None]:
def interpolate_ll(df, dt=1):
    """
    Inter- and extrapolate lake level measurements with a linear
    Kalman filter followed by an RTS smoother.

    The state is the lake level alone and is propagated unchanged
    between steps (F = 1); all change comes from process noise and from
    the measurements.

    Parameters
    ----------
    df : pandas.DataFrame
        Regularly sampled frame with a column ``'h'``. Missing samples
        are NaN; the first row must hold a valid observation.
    dt : float
        Accepted for symmetry with the other ``interpolate_*`` functions.
        The model below has no explicit time dependence, so the value is
        not used.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns ``'h'`` (smoothed values),
        ``'h_err'`` (standard deviation from the smoothed covariance)
        and ``'h_orig'`` (the untouched input observations).
    """
    ny = df['h'].values
    # The filter is handed None (not NaN) for days without a measurement.
    ny = np.where(np.isnan(ny), None, ny)

    kf = KF(dim_x=1, dim_z=1)
    kf.F = np.array([[1.]])
    kf.H = np.array([[1.]])
    kf.x = np.array([ny[0]])
    kf.Q = 1e-2**2
    kf.P = 0.03**2
    kf.R = 0.02**2
    # The first sample seeds the state; the remaining ones are filtered
    # in one batch and then smoothed backwards.
    means, covariances, _, _ = kf.batch_filter(ny[1:])
    Ms, P, _, _ = kf.rts_smoother(means, covariances)
    y_new = np.r_[ny[0], Ms[:, 0]]
    # The first sample is not smoothed; it is assigned a fixed standard
    # deviation of 0.03 (the square root of the initial kf.P).
    y_std = np.r_[0.03, np.sqrt(P[:, 0, 0])]
    return pd.DataFrame({'h': y_new,
                         'h_err': y_std,
                         'h_orig': df['h'].values},
                        index=df.index)

In [None]:
def get_ll(tstart=None, tend=None):
    """
    Download the lake level series, convert it to absolute altitude and
    return it as a gap-free, smoothed daily series.

    Parameters
    ----------
    tstart : str or timestamp-like, optional
        Earliest date to keep. Defaults to (and is clipped to) the first
        available observation.
    tend : str or timestamp-like, optional
        Latest date to keep. Defaults to the current UTC date, resolved
        when the function is *called*. (A default computed in the
        signature would be frozen at definition time and go stale in a
        long-running kernel.)

    Returns
    -------
    pandas.DataFrame
        Output of ``interpolate_ll`` on the daily series, which starts at
        the first day with a valid observation.
    """
    if tend is None:
        tend = pd.Timestamp.now(tz='UTC').strftime("%Y-%m-%d")

    # Get lake level
    # The lake level data is stored with respect to the overflow level of
    # the lake. Unfortunately, that level has changed over time so to get
    # the absolute lake level altitude, data from different periods have to
    # be corrected differently. Also, lake level data has been measured by
    # different methods requiring several requests.
    url = "https://fits.geonet.org.nz/observation?siteID={}&typeID=z"
    names = ['h', 'h_err']
    ldf = pd.read_csv(url.format('RU001'),
                      index_col=0, names=names, skiprows=1,
                      parse_dates=True)
    ldf1 = pd.read_csv(url.format('RU001A'),
                       index_col=0, names=names, skiprows=1,
                       parse_dates=True)
    # Where both sites report the same timestamp, RU001 takes priority.
    ll_df = ldf.combine_first(ldf1)

    # Period-specific corrections. Each mask is evaluated once, on the
    # merged index, before any value is modified.
    # NOTE(review): samples stamped exactly 1997-01-01 00:00, and all
    # samples from 2012-12-31 through 2016-01-01, match none of the
    # masks and are left uncorrected -- confirm this is intended.
    before_1997 = ll_df.index < '1997-01-01'
    from_1997_to_2012 = (ll_df.index > '1997-01-01') & (ll_df.index < '2012-12-31')
    after_2016 = ll_df.index > '2016-01-01'
    ll_df.loc[before_1997, 'h'] = 2530. + ll_df.loc[before_1997, 'h']
    ll_df.loc[from_1997_to_2012, 'h'] = 2529.5 + \
        (ll_df.loc[from_1997_to_2012, 'h'] - 1.3)
    ll_df.loc[after_2016, 'h'] = 2529.35 + (ll_df.loc[after_2016, 'h'] - 2.0)

    if tstart is not None:
        tstart = max(ll_df.index.min(), pd.Timestamp(tstart))
    else:
        tstart = ll_df.index.min()
    in_window = (ll_df.index >= tstart) & (ll_df.index <= tend)
    ll_df = ll_df.loc[in_window]

    # Average several measurements of the same day, then put the series
    # on a regular daily grid (days without data become NaN).
    ll_df = ll_df.groupby(common_date).mean()
    new_dates = pd.date_range(start=tstart, end=tend, freq='D')
    ll_df = ll_df.reindex(index=new_dates)
    # Find the first non-NaN entry
    tstart_min = ll_df.loc[~ll_df['h'].isnull()].index[0]
    # Ensure the time series starts with a non-NaN value
    ll_df = ll_df.loc[ll_df.index >= tstart_min]
    return interpolate_ll(ll_df)

# Request lake level for the date range covered by the Mg++ series so
# that both frames can be combined later on.
# NOTE(review): get_ll trims leading days without data, so df3 can start
# later than df1 if no lake level exists on df1.index[0] -- confirm the
# indices match before combining.
df3 = get_ll(tstart=df1.index[0], tend=df1.index[-1])
df3.head()

In [None]:
# Smoothed lake-level series with its 3-sigma band and the raw observations.
fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(df3.index, df3['h'], 'k--', label='smoothed')
ax.fill_between(df3.index, df3['h']-3*df3['h_err'],
                df3['h']+3*df3['h_err'], alpha=0.5,
                label=r'$\pm 3\sigma$')
ax.plot(df3.index, df3['h_orig'], 'k+', label='observed')
ax.set_xlabel('Date')
ax.set_ylabel('Lake level')
ax.set_title('Lake level: observations and Kalman-smoothed series')
ax.legend();


In [None]:
def derived_obs(df1, df2, df3, nsamples=100, seed=None):
    """
    Compute absolute amount of Mg++, volume, lake area,
    water density and lake mass using Monte Carlo sampling.

    For every time step, ``nsamples`` normally distributed realisations
    of the Mg++ concentration and of the lake level are drawn from the
    smoothed value and its standard deviation. The lake level samples
    are converted to area and volume with ``fullness`` and combined with
    the other quantities; the sample mean and standard deviation are
    returned.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Mg++ series with columns ``'obs'`` and ``'obs_err'``.
    df2 : pandas.DataFrame
        Temperature series with columns ``'t'`` and ``'t_err'``.
    df3 : pandas.DataFrame
        Lake level series with columns ``'h'`` and ``'h_err'``.
    nsamples : int
        Number of Monte Carlo realisations per time step.
    seed : int, optional
        If given, a private ``numpy.random.RandomState(seed)`` is used so
        the result is reproducible. If None (default) the global NumPy
        random state is used, as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, X_err, vol, vol_err, p, p_err, M, M_err, a, a_err)`` with
        one entry per time step: Mg++ amount (concentration * volume),
        volume, density (linear in temperature, not sampled), mass
        (density * volume) and area, each followed by its standard
        deviation.

    Raises
    ------
    ValueError
        If the three input series do not have the same length.
    """
    sizes = (df1['obs'].size, df2['t'].size, df3['h'].size)
    if len(set(sizes)) != 1:
        raise ValueError("df1, df2 and df3 must have the same length, "
                         "got {}, {} and {}".format(*sizes))

    rng = np.random if seed is None else np.random.RandomState(seed)

    def _sample(mean, std):
        # One row per time step, one column per realisation.
        return rng.randn(mean.size, nsamples)*std[:, np.newaxis] \
            + mean[:, np.newaxis]

    # Draw order (Mg++, lake level, density) is kept fixed so that a
    # seeded run always produces the same samples.
    rn1 = _sample(df1['obs'].values, df1['obs_err'].values)

    rn2 = _sample(df3['h'].values, df3['h_err'].values)
    a, vol = fullness(rn2)
    X = rn1*vol

    # Density as a linear function of temperature; its uncertainty is
    # the propagated temperature uncertainty.
    p_mean = 1.003 - 0.00033 * df2['t'].values
    p_std = 0.00033*df2['t_err'].values

    rn3 = _sample(p_mean, p_std)
    M = rn3*vol
    return (X.mean(axis=1), X.std(axis=1), vol.mean(axis=1), vol.std(axis=1),
            p_mean, p_std, M.mean(axis=1), M.std(axis=1),
            a.mean(axis=1), a.std(axis=1))
# The derived quantities are Monte Carlo estimates. Seed the global NumPy
# random state so that re-running the notebook reproduces the same values
# (and therefore the same 'measurements.h5').
np.random.seed(42)
X, X_err, v, v_err, p, p_err, M, M_err, a, a_err = derived_obs(df1, df2, df3, nsamples=20000)

In [None]:
# Collect all smoothed observations and the quantities derived from them
# in one frame; every value column is followed by its standard deviation.
#   T : lake temperature (df2)        z : lake level (df3)
#   Mg: Mg++ concentration (df1)      X : Mg++ amount (concentration * volume)
#   v : lake volume                   a : lake area
#   p : density, linear in T          M : lake mass (density * volume)
# The Series carry a date index while X, v, a, p, M are plain arrays, so
# this only works when all inputs cover the same dates.
df = pd.DataFrame({'T': df2['t'],
                   'T_err': df2['t_err'],
                   'z': df3['h'],
                   'z_err': df3['h_err'],
                   'Mg': df1['obs'],
                   'Mg_err': df1['obs_err'],
                   'X': X,
                   'X_err': X_err,
                   'v': v,
                   'v_err': v_err,
                   'a': a,
                   'a_err': a_err,
                   'p': p,
                   'p_err': p_err,
                   'M': M,
                   'M_err': M_err})
df

In [None]:
# Cache the prepared measurements so the inversion can be re-run without
# repeating the downloads. `key` is passed by keyword: recent pandas
# releases deprecate passing it positionally.
df.to_hdf('measurements.h5', key='table')

## Inversion

In [None]:
# Reload the prepared measurements from the cache written by the data
# preparation section, so the inversion can start from here.
df = pd.read_hdf('measurements.h5', key='table')

In [None]:
class PyCallback(Callback):
    """
    Callback function to compute the log-likelihood for nested sampling.

    ``set_data`` stores the observations and lake properties of one time
    step; ``run`` is then called with a parameter sample, runs the
    forward model and scores its prediction against the next observation
    with a multivariate normal log-density.
    """

    def __init__(self):
        Callback.__init__(self)

    def set_data(self, y0, y1, cov, vol, a, solar, dt, ws):
        """
        Store the data of one time step.

        Parameters
        ----------
        y0 : numpy.ndarray
            State at the start of the step (input to the forward model).
        y1 : numpy.ndarray
            Observed state at the end of the step (prediction target).
        cov : numpy.ndarray
            Covariance matrix of ``y1``; it is inverted, so it must be
            non-singular.
        vol, a, solar, dt, ws
            Passed through unchanged to ``forward_model``.
        """
        self.y0 = y0
        self.y1 = y1
        self.ws = ws
        self.vol = vol
        self.a = a
        self.solar = solar
        self.dt = dt
        self.cov = cov
        # the precision matrix is the inverse of the
        # covariance matrix
        self.prec = np.linalg.inv(cov)
        # Log of the normal density's normalisation constant,
        #   log(1/sqrt((2*pi)^k * |cov|)) = -0.5*(k*log(2*pi) + log|cov|).
        # slogdet avoids under-/overflow of the determinant.
        _, logdet = np.linalg.slogdet(self.cov)
        self.factor = -0.5*(self.cov.shape[0]*np.log(2.*np.pi) + logdet)
        self.y_new = None

    def run(self, vals):
        """
        Return the log-likelihood of the parameter sample ``vals``.

        ``vals`` holds, in this order, the heat input, the inflow and
        the outflow. The forward model prediction is kept in
        ``self.y_new``.

        Raises
        ------
        SamplingException
            If anything fails while evaluating the model.
        """
        try:
            # 0.0864 == 86400 * 1e-6, the same factor as for the outflow.
            Q_in = vals[0]*0.0864
            M_melt = vals[1]
            Mout = vals[2]*86400.*1.e-6 # convert l/s into 1e3 m^3/day
            H = 6.

            # Use the time step stored by set_data, not a notebook global.
            y_new, steam, mevap = forward_model(self.y0, self.dt, self.a,
                                                self.vol, Q_in, M_melt, Mout,
                                                self.solar, H, self.ws)
            self.y_new = y_new
            resid = y_new - self.y1
            lh = self.factor - 0.5*np.dot(resid, np.dot(self.prec, resid))
        except BaseException as err:
            # NOTE(review): catches everything, exactly as the original
            # bare `except` did; presumably the sampler only understands
            # SamplingException -- confirm before narrowing this. The
            # original error is chained for debugging.
            raise SamplingException("Oh no!") from err
        return lh



In [None]:
# Invert every one-day step after `startdate` independently: the state
# observed on day i is propagated with the forward model and compared
# with the observation on day i+1.
startdate = '2018-02-01'

ndf = df.loc[df.index >= startdate]
# Named `dates` rather than `datetime` so that the `datetime` module
# imported at the top of the notebook is not shadowed.
dates = ndf.index

nsteps = ndf.shape[0] - 1
dt = 1.

nsamples = 4000
nresample = 100
nparams = 3

# Priors of the three free parameters. The enthalpy is not sampled; it
# is fixed at 6. in the forward-model calls.
qin = Uniform('qin', 0, 800)
m_in = Uniform('m_in', 0, 20)
m_out = Uniform('m_out', 30, 80)
ws = 4.5

# return values
qin_samples = np.zeros((nsteps, nresample))
m_in_samples = np.zeros((nsteps, nresample))
m_out_samples = np.zeros((nsteps, nresample))
h_samples = np.zeros((nsteps, nresample))
lh = np.zeros((nsteps, nresample))
exp = np.zeros((nsteps, nparams))
var = np.zeros((nsteps, nparams))
model_data = np.zeros((nsteps, nresample, 3))
steam = np.zeros((nsteps, nresample))
mevap = np.zeros((nsteps, nresample))

ns = NestedSampling()
pycb = PyCallback()
pycb.__disown__()
ns.setCallback(pycb)

with progressbar.ProgressBar(max_value=nsteps-1) as bar:
    for i in range(nsteps):
        # Explicit positional row access. The loop scalars get their own
        # names so the arrays `a`, `M` and `X` computed during data
        # preparation are not overwritten.
        row = ndf.iloc[i]
        row_next = ndf.iloc[i+1]

        area = row['a']
        volume = row['v']
        solar = esol(i*dt, area, dates)

        y = np.array([row['T'], row['M'], row['X']])
        y_next = np.array([row_next['T'], row_next['M'], row_next['X']])
        # The *_err columns are standard deviations; a covariance matrix
        # carries variances on its diagonal.
        sigma = np.array([row_next['T_err'], row_next['M_err'],
                          row_next['X_err']])
        cov = np.diag(sigma**2)

        pycb.set_data(y, y_next, cov, volume, area, solar, dt, ws)
        rs = ns.explore(vars=[qin, m_in, m_out], initial_samples=100,
                        maximum_steps=nsamples)
        smp = rs.resample_posterior(nresample)
        exp[i,:] = rs.getexpt()
        var[i,:] = rs.getvar()
        for j,_s in enumerate(smp):
            Q_in = _s._vars[0].get_value()
            M_in = _s._vars[1].get_value()
            M_out = _s._vars[2].get_value()
            # Re-evaluate the forward model with the same unit
            # conversions PyCallback.run applies to the samples
            # (0.0864 == 86400 * 1e-6 for heat input and outflow).
            y_mod, st, me = forward_model(y, dt, area, volume, Q_in*0.0864,
                                          M_in, M_out*86400.*1.e-6, solar,
                                          6., ws)
            steam[i, j] = st
            mevap[i, j] = me
            model_data[i, j, :] = y_mod
            # The stored samples stay in the units of the priors.
            qin_samples[i, j] = Q_in
            m_in_samples[i, j] = M_in
            m_out_samples[i, j] = M_out
            lh[i, j] = np.exp(_s._logL)
        del smp
        bar.update(i)


In [None]:
# Reference solution of the clemb model for comparison with the inversion.
# Running the model is only done on request; by default the cached
# result is loaded. Set the flag to True to regenerate the cache.
RECOMPUTE_CLEMB = False

if RECOMPUTE_CLEMB:
    c = clemb.Clemb(clemb.LakeDataFITS(), clemb.WindDataCSV(), start=startdate, end=ndf.index[-1])
    rp = c.run([0,1])
    # `key` is passed by keyword: recent pandas releases deprecate
    # passing it positionally.
    rp.to_hdf('clemb_output.h5', key='table')
else:
    rp = pd.read_hdf('clemb_output.h5', key='table')
# Select the entries labelled 0 and pick the series that are overlaid on
# the inversion results in the summary figure.
pwr = rp.loc[0,'pwr']
melt = rp.loc[0,'fmelt']
orig_mevap = rp.loc[0, 'evfl']
orig_steam = rp.loc[0, 'steam']

In [None]:
# Summary figure: observations and model forecasts (rows 0-1), inverted
# parameters (rows 2-3) and derived fluxes (row 4).
mpl.rcParams['figure.subplot.hspace'] = 0.5
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(10, 8))

obs_steps = np.arange(nsteps+1)   # one point per observation
par_steps = np.arange(nsteps)     # one point per inverted step
# model_data[i] is the forecast made from observation i and is scored
# against observation i+1, so it is drawn at i+1.
model_steps = par_steps + 1


def _plot_observation(ax, column):
    """Draw an observed series of `ndf` with its 3-sigma band."""
    ax.plot(obs_steps, ndf[column], ls='--')
    ax.fill_between(obs_steps, ndf[column]-3*ndf[column+'_err'],
                    ndf[column]+3*ndf[column+'_err'], alpha=0.5)


def _scatter_samples(ax, samples):
    """Scatter the posterior samples of every step, coloured by likelihood."""
    # One call per step on purpose: the colours are then normalised
    # within each step. The colormap is given by name because
    # plt.cm.get_cmap is no longer available in recent matplotlib.
    for k in par_steps:
        ax.scatter(np.ones(nresample)*k, samples[k], s=2, c=lh[k],
                   cmap='RdBu_r', alpha=0.3)


def _plot_posterior(ax, idx):
    """Draw posterior mean and 3-sigma envelope of parameter `idx`."""
    mean = exp[:, idx]
    band = 3*np.sqrt(var[:, idx])
    ax.plot(par_steps, mean, 'k')
    ax.plot(par_steps, mean - band, 'k--')
    ax.plot(par_steps, mean + band, 'k--')


# The wind speed is not inverted; show the constant used in the inversion.
axs[0,0].plot(obs_steps, np.ones(ndf.index.size)*ws, ls='--')
axs[0,0].set_title('Wind speed [m/s]')

_plot_observation(axs[0,1], 'X')
axs[0,1].plot(model_steps, model_data[:,:,2].mean(axis=1))
axs[0,1].set_title('Mg++ amount')

_plot_observation(axs[1,0], 'T')
T_mod_mean = model_data[:,:,0].mean(axis=1)
T_mod_std = model_data[:,:,0].std(axis=1)
axs[1,0].plot(model_steps, T_mod_mean, 'k-')
axs[1,0].plot(model_steps, T_mod_mean + T_mod_std, 'k--')
axs[1,0].plot(model_steps, T_mod_mean - T_mod_std, 'k--')
axs[1,0].set_title('Lake temperature')

_plot_observation(axs[1,1], 'M')
axs[1,1].plot(model_steps, model_data[:,:,1].mean(axis=1))
axs[1,1].set_title('Lake mass')

_scatter_samples(axs[2,0], qin_samples)
_plot_posterior(axs[2,0], 0)
axs[2,0].plot(par_steps, pwr.values, 'b-')
axs[2,0].set_title('Heat input rate')

# The enthalpy is not inverted; show the constant used in the inversion.
axs[2,1].plot(obs_steps, np.ones(ndf.index.size)*6.0, ls='--')
axs[2,1].set_title('Enthalpy')

_scatter_samples(axs[3,0], m_in_samples)
_plot_posterior(axs[3,0], 1)
axs[3,0].plot(par_steps, melt.values, 'b-')
axs[3,0].set_title('Inflow')

# Only the samples are shown for the outflow.
# NOTE(review): a posterior envelope would be _plot_posterior(axs[3,1], 2),
# since the outflow is the third sampled parameter -- add it if wanted.
_scatter_samples(axs[3,1], m_out_samples)
axs[3,1].set_title('Outflow')

_scatter_samples(axs[4,0], steam)
axs[4,0].plot(par_steps, orig_steam.values, 'b-')
axs[4,0].set_title('Steam input')

_scatter_samples(axs[4,1], mevap)
axs[4,1].plot(par_steps, orig_mevap.values, 'b-')
axs[4,1].set_title('Evaporation mass loss');

