## Traditional model
We build a traditional MSM using the usual tICA $\to$ $k$-Means $\to$ MSM $\to$ HMM/PCCA approach.

In [None]:
from collections import defaultdict
import gc
from glob import glob
from multiprocessing import Pool
import itertools
import pickle
import os
from typing import List, Tuple, Sequence
import warnings

import matplotlib.pyplot as plt
from matplotlib import rc, ticker
from matplotlib.colors import ListedColormap
from msmbuilder.cluster import MiniBatchKMeans, GMM, MiniBatchKMedoids, AgglomerativeClustering
from msmtools.analysis import stationary_distribution, mfpt
from msmtools.flux import tpt
import mdtraj as md
import numpy as np
import pandas as pd
import pyemma as pe
from scipy.linalg import eig
from scipy.stats import gaussian_kde
from sklearn.model_selection import KFold
import seaborn as sns
import xarray as xr

# Plot settings
# Default seaborn palette: 8 colours from the "husl" colour space.
sns.set_palette("husl", 8)
rc("font", **{"family": "Helvetica",
              "sans-serif": ["Helvetica"]})
# NOTE(review): per the matplotlib docs, fonttype "none" should keep text
# as editable text (not paths) in SVG output - confirm for the installed version.
rc("svg", **{"fonttype": "none"})
# Explicit colour list, indexed by the plotting cells below (colors[i]).
colors = sns.color_palette("husl", 8)

# Silences *all* warnings for the rest of the session, including numerical
# and deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
def unflatten(source: np.ndarray, lengths: Sequence[Sequence[int]]) -> List[np.ndarray]:
    """
    Split a concatenated array back into its consecutive subarrays.
    
    Parameters
    ----------
    source
        Array to be unflattened. It is sliced along its first axis.
    lengths
        Nested sequence of integers: one inner sequence per group, each
        entry giving the length of one subarray. The grouping is not
        preserved in the output. All entries together must sum to the
        length of source.
    
    Returns
    -------
    unflat
        Flat list of arrays, one per entry of lengths, in order.
    
    Raises
    ------
    ValueError
        If the lengths do not sum to the length of source.
    
    """
    unflat = []
    start = 0
    for group in lengths:
        for length in group:
            stop = start + length
            unflat.append(source[start:stop])
            start = stop
    # A mismatch means the lengths belong to a different data set; slicing
    # would otherwise silently return truncated or incomplete pieces.
    if start != len(source):
        raise ValueError(
            "lengths sum to {0}, but source has length {1}".format(start, len(source)))
    return unflat

In [None]:
def sort_lengths(flatlengths: Sequence[int], shapes: Sequence[int]) -> List[List[int]]:
    """
    Regroup a flat sequence of lengths into consecutive batches.
    
    Parameters
    ----------
    flatlengths
        Flat sequence of lengths.
    shapes
        Number of entries of flatlengths that go into each batch,
        consumed in order.
    
    Returns
    -------
    lengths
        One list per entry of shapes, holding that many consecutive
        entries of flatlengths.
    
    """
    grouped = []
    offset = 0
    for count in shapes:
        batch = [flatlengths[pos] for pos in range(offset, offset + count)]
        grouped.append(batch)
        # Advance by what was actually taken
        offset += len(batch)
    return grouped

In [None]:
def magnitudes(mag, step=1):
    """
    Build tick positions spanning several powers of ten.
    
    Parameters
    ----------
    mag
        Iterable of exponents; each contributes one decade of values.
    step
        Spacing of the multipliers taken from the range 1 to 9.
    
    Returns
    -------
    ticks
        Concatenated array of multiplier * 10 ** exponent, one decade
        after the other in the order of mag.
    
    """
    multipliers = np.arange(1, 10, step)
    decades = [multipliers * (10 ** exponent) for exponent in mag]
    return np.concatenate(decades)

## Data
### Trajectories
Trajectories were acquired in five rounds of 1024 simulations each, totalling 5119 runs (one simulation failed to run) at 278 K in the $NVT$ ensemble. Postprocessing involved removing water, subsampling to 250 ps timesteps, and making molecules whole.

In [None]:
# Sorted so that the trajectory order is reproducible between sessions.
trajs = sorted(glob("trajectories/red/r?/traj*.xtc"))
top = "trajectories/red/topol.gro"
# Thermal energy, used to scale the free energy surface below.
# NOTE(review): the value matches R * 278 K in kJ/mol - confirm the unit.
KBT = 2.311420 # 278 K
# NOTE(review): looks like the cumulative number of trajectories after each
# round (one run missing in round two); not referenced in the cells shown here.
traj_rounds = [1024, 2047, 3071, 4095, 5119]
# Number of residues; used to enumerate all residue pairs for the mindist features.
nres = 42
# Time per frame; the plots label lags * dt in ns.
dt = 0.25

# This is only really necessary for the residues in the plots
topo = md.load_topology(top)
# Lag times in frames (converted with dt when plotting).
lags = np.array([1, 2, 5, 10, 20, 50, 100])

We use inter-residue minimum distances as input features for the model:

In [None]:
# This source is only used to read the trajectory lengths; the features
# used for the model are computed (or loaded from disk) in a later cell.
feat = pe.coordinates.featurizer(top)
feat.add_residue_mindist()
inpcon = pe.coordinates.source(trajs, feat)

# Switch for full version:
# lengths = sort_lengths(inpcon.trajectory_lengths(), [1024, 1023, 1024, 1024, 1024])
# unflatten() expects a list of groups, hence the single enclosing list.
lengths = [inpcon.trajectory_lengths()]
# Total number of frames over all trajectories.
nframes = inpcon.trajectory_lengths().sum()

In [None]:
# Summary of the data set: number of trajectories, frames, and simulated time.
print("Trajectories: {0}".format(len(trajs)))
print("Frames: {0}".format(nframes))
# dt is the time per frame in ns; the factor 1e-3 converts ns to µs.
# Derived from dt so that the report cannot drift from the time step used
# in the plots.
print("Time: {0:5.3f} µs".format(nframes * dt * 1e-3))

In [None]:
# Minimum distances between all residue pairs, cached on disk because the
# featurization of the full ensemble is expensive.
allpairs = np.asarray(list(itertools.combinations(range(nres), 2)))
filename = "intermediate/mindist-all-red.npy"
if os.path.exists(filename):
    print("Loading existing file for ensemble: {0}".format(filename))
    mindist_flat = np.load(filename)
else:
    print("No mindist file for ensemble, calculating from scratch...")
    # Create the cache directory up front, so a missing directory fails
    # before the expensive calculation rather than after it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    feat = pe.coordinates.featurizer(top)
    feat.add_residue_mindist(residue_pairs=allpairs)
    inpmindist = pe.coordinates.source(trajs, feat)
    # One row per frame, all trajectories stacked in order
    mindist_flat = np.vstack(inpmindist.get_output())
    np.save(filename, mindist_flat)
# Split the stacked frames back into one array per trajectory
mindist = unflatten(mindist_flat, lengths)

## Structure
### TICA
Time-lagged independent component analysis is a special case of Koopman operator estimation using a linear projection [1]. We solve the following generalized eigenvalue problem:

$$ \mathbf{C}_{01}v = \lambda \mathbf{C}_{00} v $$

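Here $\mathbf{C}_{00}$ is the instantaneous covariance matrix of the input features and $\mathbf{C}_{01}$ is their time-lagged covariance matrix. The eigenvectors encode the slowest dynamics of the system, and we use them as a convenient visualization technique.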

[1]	Pérez-Hernández, G., Paul, F., Giorgino, T., De Fabritiis, G. & Noé, F. Identification of slow molecular order parameters for Markov model construction. The Journal of Chemical Physics 139, 015102–14 (2013).

In [None]:
# TICA on the per-trajectory mindist features at a lag time of 20 frames.
ticacon = pe.coordinates.tica(mindist, lag=20, dim=-1, kinetic_map=True)
# Projected trajectories (one array per trajectory) and their stacked version
ticscon = ticacon.get_output()
ycon = np.vstack(ticscon)

print("tIC Dimensions: {0}".format(ycon.shape[1]))
# searchsorted gives the index of the first component whose cumulative
# variance reaches 0.9, so the number of components required is index + 1.
print("Required dimensions for 90 %: {0}".format(np.searchsorted(ticacon.cumvar, 0.9) + 1))

#### Free energy surface
We also show the free energy surface projected onto the two slowest tICs in the form of a kernel density estimate:

In [None]:
# Kernel density estimate over the first two tICs, fitted on every 10th frame.
kernel = gaussian_kde(ycon[::10, :2].T)
# Extent of the first two tICs over all frames (remaining columns discarded).
xmin, ymin, *_ = ycon.min(axis=0)
xmax, ymax, *_ = ycon.max(axis=0)
# 100 x 100 evaluation grid covering that extent.
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
posi = np.vstack((X.ravel(), Y.ravel()))
# Density evaluated on the grid, same shape as X and Y.
Z = kernel(posi).reshape(X.shape)
# Rotated copy with low densities masked out.
# NOTE(review): `mat` is not used by the plotting cell below, which works on Z.
mat = np.rot90(Z.copy())
mat[mat < 0.01] = np.nan

In [None]:
# Free energy surface over the two slowest tICs, from the density estimate Z.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
cmap = "plasma"
# F = -kT ln(p), shifted so that the global minimum sits at zero
F = -KBT * np.log(Z)
F -= F.min()
# Filled contours plus contour lines at the same levels
ax.contourf(X, Y, F, np.arange(0.0, 10, 1), cmap=cmap)
# The previous `linewidth=10` was not a contour keyword (the keyword is
# `linewidths`) and had no effect; it is dropped so the default line
# width, and therefore the rendered figure, stays the same.
ax.contour(X, Y, F, np.arange(0.0, 10, 1), cmap=cmap)
ax.tick_params(labelsize=24)
ax.set_xlabel(r"tIC 1", fontsize=24, labelpad=10)
ax.set_ylabel(r"tIC 2", fontsize=24, labelpad=10)
sns.despine(ax=ax)

### Hyperparameter scan
We first scan through the hyperparameters; in particular, we look at the dependence of the model on the number of tICs, the number of clusters, and the clustering algorithm itself:

In [None]:
# 5-fold split over whole trajectories (cv() passes the list of trajectories
# to kf.split). No random_state is set.
# NOTE(review): without a fixed random_state the shuffled folds presumably
# differ between calls and between sessions - confirm if reproducibility matters.
kf = KFold(n_splits=5, shuffle=True)
# Numbers of cluster centres scanned by cv().
n_clusters = [10, 20, 50, 100, 200, 500, 1000, 2000]

def cv(ntic, clust):
    """
    Cross-validate MSM scores over the cluster counts in n_clusters.

    Reads the module-level names ticscon, n_clusters and kf.

    Parameters
    ----------
    ntic
        Number of leading tICA components to keep from each trajectory.
    clust
        Clustering class; called as clust(n_clust), the instance must
        provide fit() and predict() on lists of arrays.

    Returns
    -------
    results
        List of dicts with the keys train, test, n_clust and fold, one
        per combination of cluster count and fold, ordered by cluster
        count first and fold second.

    """
    results = []
    # Keep only the first ntic components of every trajectory.
    fulltics = [np.ascontiguousarray(tic[:, :ntic]) for tic in ticscon]
    for n_clust in n_clusters:
        # MSM at a lag time of 50 frames; refitted for every fold below.
        model = pe.msm.MaximumLikelihoodMSM(lag=50)
        for fold, (train_inds, test_inds) in enumerate(kf.split(fulltics)):
            print("{0} clusters, fold {1}".format(n_clust, fold), end="\r")
            train = [fulltics[i] for i in train_inds]
            test = [fulltics[i] for i in test_inds]

            # Cluster centres are fitted on every 50th frame of the training
            # trajectories only, then used to discretize both sets.
            cluster = clust(n_clust)
            cluster.fit([np.ascontiguousarray(t[::50]) for t in train])
            cluster_train = cluster.predict([np.ascontiguousarray(t) for t in train])
            cluster_test = cluster.predict([np.ascontiguousarray(t) for t in test])

            # The model is estimated on the training set and scored on both.
            model.fit(cluster_train)
            train_score = model.score(cluster_train)
            test_score = model.score(cluster_test)

            results.append(dict(train=train_score, test=test_score, n_clust=n_clust, fold=fold))
    return results

# Scan over the number of tICs and the clustering algorithm. The results for
# each number of tICs are written to disk as soon as they are complete, so
# that a crash late in this long-running scan does not discard finished work.
os.makedirs("intermediate", exist_ok=True)
res = {}
for ntic in [2, 4, 8, 16, 32]:
    res[ntic] = {}
    for clust in [MiniBatchKMeans, MiniBatchKMedoids, GMM]:
        print("{0} {1}".format(ntic, clust.__name__))
        try:
            res[ntic][clust.__name__] = cv(ntic, clust)
        except MemoryError:
            # Best effort: the algorithm is skipped and its key stays
            # missing from res[ntic].
            print("Memory error, continuing...")

    with open("intermediate/scan-{0}-tic.pkl".format(ntic), "wb") as f:
        pickle.dump(res[ntic], f)

In [None]:
# Load, if necessary
# Restores the scan results written by the previous cell, keyed by the number
# of tICs. pickle.load executes arbitrary code from the file, so only load
# files produced by this notebook.
res = {}
for k in [2, 4, 8, 16, 32]:
    with open("intermediate/scan-{0}-tic.pkl".format(k), "rb") as f:
        res[k] = pickle.load(f)

In [None]:
# Flatten the nested results into one array. The comprehension iterates
# tICs -> algorithm -> (cluster count, fold) entries, which is the order the
# reshape to (5, 3, 8, 5, 2) relies on; the last axis is (train, test).
# The reshape requires every (tICs, algorithm) combination to be present.
arr = np.array([[fold["train"], fold["test"]]
                for ntic in res.values()
                for kind in ntic.values()
                for fold in kind]).reshape(5, 3, 8, 5, 2)

# Write the labelled scores to a spreadsheet.
with pd.ExcelWriter("intermediate/si-fig-1.xlsx") as xls:
    da = xr.DataArray(data=arr,
                      dims=("n_tics", "algorithm", "n_clusters", "folds", "type"),
                      coords=dict(n_tics=[2, 4, 8, 16, 32],
                                  algorithm=[clust.__name__ for clust in
                                             [MiniBatchKMeans, MiniBatchKMedoids, GMM]],
                                  n_clusters=[10, 20, 50, 100, 200, 500, 1000, 2000],
                                  type=["train", "test"]),
                      name="Hyperparameter scan")
    (da.to_dataset("n_clusters")
       .to_dataframe()
       .to_excel(xls, sheet_name="Hyperparameter scan"))

In [None]:
# Grid of score curves: one row per number of tICs, one column per
# clustering algorithm.
nx, ny = len(res[8].keys()), len(res.keys())
# squeeze=False keeps `axes` two-dimensional with shape (ny, nx), even when
# the grid has a single row or column.
fig, axes = plt.subplots(ny, nx, figsize=(4 * nx, 4 * ny),
                         sharex=True, sharey=True, squeeze=False)
for i, ktic in enumerate(sorted(res.keys())):
    for j, kmodel in enumerate(sorted(res[ktic].keys())):
        ax = axes[i, j]
        data = pd.DataFrame(res[ktic][kmodel])
        # Median over the folds for every cluster count
        meds = data.groupby("n_clust").median()
        # idxmax returns the index *label* (the cluster count) of the best
        # median test score. argmax returns a position in current pandas,
        # which is wrong both for the .loc lookup and as an x-coordinate.
        best_n = meds["test"].idxmax()
        score = meds.loc[best_n, "test"]
        ax.plot(meds.index, meds["train"], linewidth=2, color=colors[4], label="Train")
        ax.plot(meds.index, meds["test"], linewidth=2, color=colors[5], label="Test")
        ax.scatter(data["n_clust"], data["train"], marker="o", color=colors[4], alpha=0.5, label=None)
        ax.scatter(data["n_clust"], data["test"], marker="o", color=colors[5], alpha=0.5, label=None)
        # Star marks the best median test score
        ax.plot(best_n, score, marker="*", color="black", markersize=20)
        ax.text(10, 1, r"$MVC = {0:4.3}$".format(score), fontsize=24)
        ax.set_xscale("log")
        ax.set_xticks(magnitudes([1, 2, 3, 4]))
        ax.set_xlim(5, 5000)
        if i == 0:
            ax.set_title(kmodel, fontsize=24)
        if i == ny - 1:
            ax.set_xlabel("# Clusters", fontsize=24)
        if j == 0:
            ax.set_ylabel("Score", fontsize=24)
        # Row label on the right-hand side of the last column
        if j == nx - 1:
            ax.yaxis.set_label_position("right")
            ax.set_ylabel("{0} tICs".format(ktic), fontsize=24, labelpad=20)
        ax.set_ylim(0, 10)
        ax.tick_params(labelsize=24)
fig.savefig("figs/trad-mvcscan.pdf", bbox_inches="tight", transparent=True)
fig.savefig("figs/trad-mvcscan.svg", bbox_inches="tight", transparent=True)

We can now build a fine-grained model using the best hyperparameters (*k*-Means, 200 clusters, 16 tICs) and evaluate the implied timescales:

In [None]:
# NOTE(review): `n` is not used in this cell; later cells rebind it as a
# loop variable.
n = 4
# Hyperparameters chosen from the scan above.
n_clust, n_tic = 200, 16
cluster = MiniBatchKMeans(n_clust)
# Cluster centres are fitted on every 50th frame of the stacked tICA
# projection, restricted to the first n_tic components.
cluster.fit([ycon[::50, :n_tic]])
# Discretize every trajectory with the fitted centres.
dtrajs = cluster.predict([t[:, :n_tic] for t in ticscon])
# Implied timescales at the lag times in `lags`, with Bayesian error estimates.
its = pe.msm.timescales_msm(dtrajs, lags=lags, n_jobs=8, errors="bayes")

In [None]:
# Export the implied timescales of the fine-grained MSM: the estimate itself
# plus the mean and standard deviation of the samples, stacked along "kind".
with pd.ExcelWriter("intermediate/si-fig-2-a.xlsx") as xls:
    da = xr.DataArray(data=np.stack([its.get_timescales(), its.sample_mean, its.sample_std]),
                      dims=("kind", "lagtimes", "timescales"),
                      coords=dict(kind=["timescales", "sample mean", "sample std"]),
                      name="Implied timescales")
    (da.to_dataset("lagtimes")
       .to_dataframe()
       .to_excel(xls, sheet_name="Timescales 200 clusters"))

In [None]:
# Implied timescale plot for the fine-grained MSM.
# Error band: sample mean plus/minus one sample standard deviation.
cfl = its.sample_mean - its.sample_std
cfu = its.sample_mean + its.sample_std
mits = its.get_timescales()
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111)
# Eight slowest processes, one palette colour each. Solid line: estimate from
# get_timescales(); dashed line: sample mean. Multiplying by dt converts
# frames to the ns shown on the axes.
for i in range(8):
    ax.semilogy(its.lags * dt, mits[:, i] * dt, marker="o", linewidth=1.5, color=colors[i])
    ax.plot(its.lags * dt, its.sample_mean[:, i] * dt, marker="o",
            linewidth=1.5, color=colors[i], linestyle="dashed")
    ax.fill_between(its.lags * dt, cfl[:, i] * dt, cfu[:, i] * dt,
                    interpolate=True, color=colors[i], alpha=0.2)
# Diagonal t = tau, with the region below it shaded grey.
ax.plot(its.lags * dt, its.lags * dt, color="k")
ax.fill_between(its.lags * dt, ax.get_ylim()[0] * np.ones(lags.shape[0]),
                its.lags * dt, color="k", alpha=0.2)
ax.set_yticks(magnitudes([0, 1, 2, 3, 4, 5]))
ax.set_ylim(1, 100000)
sns.despine(ax=ax)
ax.set_xlabel(r"$\tau$ [ns]", fontsize=24)
ax.set_ylabel(r"$t_i$ [ns]", fontsize=24)
ax.tick_params(labelsize=24)
fig.savefig("figs/its-trad-{0}-{1}-km.pdf".format(n_clust, n_tic), bbox_inches="tight", transparent=True)
fig.savefig("figs/its-trad-{0}-{1}-km.svg".format(n_clust, n_tic), bbox_inches="tight", transparent=True)

We can now attempt coarse-graining using Perron cluster-cluster analysis (PCCA):

In [None]:
# Implied timescales of PCCA coarse-grained models, estimated from randomly
# drawn samples of a Bayesian MSM at every lag time.
outsizes = np.array([2, 3, 4, 5, 6])
n_samples = 20
# For every number of coarse states n: array of shape (lag times, n - 1
# timescales, n_samples), filled below with timescales in units of dt.
its = {n: np.zeros((lags.shape[0], n - 1, n_samples)) for n in outsizes}
for i, tau in enumerate(lags):
    msm = pe.msm.bayesian_markov_model(dtrajs, lag=tau)
    for n in outsizes:
        print("lag={0}, n={1}".format(tau, n), end="\r")
        # Draw (with replacement) from however many samples the model holds
        for j, idx in enumerate(np.random.randint(len(msm.samples), size=n_samples)):
            pcca = msm.samples[idx].pcca(n)
            eigenvalues = np.linalg.eigvals(pcca.coarse_grained_transition_matrix)
            # eigvals returns the eigenvalues in no particular order. Sort
            # them in descending order so that dropping the first entry
            # removes the largest (stationary) eigenvalue and the remaining
            # timescales are ordered consistently across lag times. Only the
            # real part is kept, since the target array is real-valued.
            lambdas = np.sort(eigenvalues.real)[::-1][1:]
            ts = -tau * dt / np.log(lambdas)
            # Mask negative and implausibly long timescales
            ts[(ts < 0.0) | (ts > 1e5)] = np.nan
            its[n][i, :, j] = ts

In [None]:
# One implied timescale figure per number of PCCA states. `its[n]` already
# holds times multiplied by dt, so only the lag axis is scaled here.
for n in outsizes:
    # Mean and 2.5 / 97.5 percentiles over the drawn samples, ignoring NaNs.
    itsm = np.nanmean(its[n], axis=-1)
    itsl, itsh = np.nanpercentile(its[n], q=(2.5, 97.5), axis=-1)
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)
    for i in range(n - 1):
        # NOTE(review): the solid and the dashed line both draw the same
        # mean (itsm), so they coincide.
        ax.semilogy(lags * dt, itsm[:, i], marker="o", linewidth=1.5, color=colors[i])
        ax.plot(lags * dt, itsm[:, i], marker="o",
                linewidth=1.5, color=colors[i], linestyle="dashed")
        ax.fill_between(lags * dt, itsl[:, i], itsh[:, i],
                        interpolate=True, color=colors[i], alpha=0.2)
    # Diagonal t = tau, with the region below it shaded grey.
    ax.plot(lags * dt, lags * dt, color="k")
    ax.fill_between(lags * dt, ax.get_ylim()[0] * np.ones(lags.shape[0]),
                    lags * dt, color="k", alpha=0.2)
    ax.set_yticks(magnitudes([0, 1, 2, 3, 4, 5]))
    ax.set_ylim(1, 100000)
    sns.despine(ax=ax)
    ax.set_xlabel(r"$\tau$ [ns]", fontsize=24)
    ax.set_ylabel(r"$t_i$ [ns]", fontsize=24)
    ax.tick_params(labelsize=24)
    fig.savefig("figs/its-pcca-1e5-{0}.pdf".format(n), bbox_inches="tight", transparent=True)
    fig.savefig("figs/its-pcca-1e5-{0}.svg".format(n), bbox_inches="tight", transparent=True)

In [None]:
# Export the sampled PCCA timescales for 4 and 6 coarse states, one sheet each.
with pd.ExcelWriter("intermediate/si-fig-2-e-f.xlsx") as xls:
    for n in [4, 6]:
        da = xr.DataArray(data=its[n],
                          dims=("lagtimes", "timescales", "attempts"),
                          name="Implied timescales PCCA")
        # Excel limits sheet names to 31 characters; the previous name
        # ("Implied timescales PCCA {n} clusters") was 34 characters long.
        (da.to_dataset("attempts")
           .to_dataframe()
           .to_excel(xls, sheet_name="PCCA timescales {0} clusters".format(n)))

We can now attempt to build coarse-grained hidden Markov state models with different numbers of coarse-grained states and evaluate their respective timescales:

In [None]:
# Implied timescales of hidden Markov models with 2, 4 and 6 hidden states,
# keyed by the number of states. This rebinds `its`, replacing the PCCA
# results of the cells above.
its = {}
for n in [2, 4, 6]:
    its[n] = pe.msm.timescales_hmsm(dtrajs, n, lags=lags, n_jobs=1, errors="bayes", nsamples=100, stride=20)

In [None]:
# One implied timescale figure per HMM size.
# Number of lag times shown; `lags` has seven entries.
# NOTE(review): the [:n_lags] slices are no-ops if its[n].lags matches `lags` - confirm.
n_lags = 7
for n in [2, 4, 6]:
    # Lower / upper bounds as returned by get_sample_conf().
    cfl, cfu = its[n].get_sample_conf()
    mits = its[n].get_timescales()
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)
    # Solid line: estimate from get_timescales(); dashed line: sample mean.
    # Multiplying by dt converts frames to the ns shown on the axes.
    for i in range(n - 1):
        ax.semilogy(its[n].lags[:n_lags] * dt, mits[:, i][:n_lags] * dt, marker="o", linewidth=1.5, color=colors[i])
        ax.plot(its[n].lags[:n_lags] * dt, its[n].sample_mean[:, i][:n_lags] * dt, marker="o",
                linewidth=1.5, color=colors[i], linestyle="dashed")
        ax.fill_between(its[n].lags[:n_lags] * dt, cfl[:, i][:n_lags] * dt, cfu[:, i][:n_lags] * dt,
                        interpolate=True, color=colors[i], alpha=0.2)
    # Diagonal t = tau, with the region below it shaded grey.
    ax.plot(its[n].lags[:n_lags] * dt, its[n].lags[:n_lags] * dt, color="k")
    ax.fill_between(its[n].lags[:n_lags] * dt, ax.get_ylim()[0] * np.ones(its[n].lags[:n_lags].shape[0]),
                    its[n].lags[:n_lags] * dt, color="k", alpha=0.2)
    ax.set_yticks(magnitudes([0, 1, 2, 3, 4, 5]))
    ax.set_ylim(1, 100000)
    sns.despine(ax=ax)
    ax.set_xlabel(r"$\tau$ [ns]", fontsize=24)
    ax.set_ylabel(r"$t_i$ [ns]", fontsize=24)
    ax.tick_params(labelsize=24)
    # File name: number of hidden states, then cluster count and tIC count.
    fig.savefig("figs/its-trad-hmm-{2}-{0}-{1}-km.pdf".format(n_clust, n_tic, n),
                bbox_inches="tight", transparent=True)
    fig.savefig("figs/its-trad-hmm-{2}-{0}-{1}-km.svg".format(n_clust, n_tic, n),
                bbox_inches="tight", transparent=True)

In [None]:
# Export the HMM implied timescales, one sheet per number of hidden states:
# the estimate itself plus the mean and standard deviation of the samples,
# stacked along "kind".
with pd.ExcelWriter("intermediate/si-fig-2-b-c-d.xlsx") as xls:
    for n in [2, 4, 6]:
        da = xr.DataArray(data=np.stack([its[n].get_timescales(), its[n].sample_mean, its[n].sample_std]),
                          dims=("kind", "lagtimes", "timescales"),
                          coords=dict(kind=["timescales", "sample mean", "sample std"]),
                          name="Implied timescales")
        (da.to_dataset("lagtimes")
           .to_dataframe()
           .to_excel(xls, sheet_name="HMM timescales {0} clusters".format(n)))