In [None]:
# Papermill params
# -- Bagging: how the data is split and how often training is repeated
ratio = 0.9             # Train-Test split ratio
attempts = 20           # Number of times to run

# -- Network shape (forwarded to the model as `width` / `depth`)
width = 256
depth = 5

# -- Optimisation and regularisation settings forwarded to the model
learning_rate = 5e-2
dropout = 0.0
regularization = 1e-8

# Neural network

In this notebook we set up neural networks with VAMPNet scoring functions, train them for different output sizes, and estimate errors by bootstrap aggregation. This notebook can be used with `papermill` to run all cells automatically with given parameters. We first load the imports and useful utility functions.

In [None]:
# Execute model.py inside the notebook namespace.
# NOTE(review): none of the names used by the visible cells below (glob, md, pe,
# np, gc, unflatten, sort_lengths, DataGenerator, KoopmanModel) are imported in
# this notebook, so they presumably all come from this script -- confirm.
%run model.py

## Data
### Trajectories
Trajectories were acquired in five rounds of 1024 simulations each, totalling 5119 runs (one simulation failed to run) at 278 K in the $NVT$ ensemble. Postprocessing involved removing water, subsampling to 250 ps timesteps, and making molecules whole.

In [None]:
# Glob patterns of the five simulation rounds, listed in acquisition order
round_patterns = (
    "trajectories/r1/traj*.xtc",
    "trajectories/r2/traj*.xtc",
    "trajectories/r3/traj*.xtc",
    "trajectories/r4/traj*.xtc",
    "trajectories/r5/traj*.xtc",
)

# Files are sorted within a round, and the rounds are concatenated in order
trajs = [xtc for pattern in round_patterns for xtc in sorted(glob(pattern))]

top = "trajectories/topol.gro"
# Thermal energy at 278 K (the value matches k_B * T in kJ/mol -- confirm units)
KBT = 2.311420

# The topology object is only really needed to label residues in the plots
topo = md.load_topology(top)

We use minimum distances as features for the neural network:

In [None]:
# Build a featurizer from the topology file and register residue minimum
# distances as the features to compute.
feat = pe.coordinates.featurizer(top)
feat.add_residue_mindist()
# Data source over all trajectory files, read through the featurizer above.
inpcon = pe.coordinates.source(trajs, feat)

# Uncomment for full version:
# lengths = sort_lengths(inpcon.trajectory_lengths(), [1024, 1023, 1024, 1024, 1024])
# Reduced version: use the per-trajectory lengths exactly as the source reports them.
lengths = inpcon.trajectory_lengths()
# Total number of frames summed over all trajectories.
nframes = inpcon.trajectory_lengths().sum()

In [None]:
# Summary of the loaded data set. The factor 0.00025 turns a frame count into
# microseconds (frames are 250 ps apart after subsampling).
summary = (
    ("Trajectories: {0}", len(trajs)),
    ("Frames: {0}", nframes),
    ("Time: {0:5.3f} µs", inpcon.trajectory_lengths().sum() * 0.00025),
)
for template, value in summary:
    print(template.format(value))

## VAMPNet
VAMPNet[1] is composed of two lobes, one reading the system features $\mathbf{x}$ at a timepoint $t$ and the other after some lag time $\tau$. In this case the network reads all minimum inter-residue distances (780 values) and sends them through 5 layers with 256 nodes each. The final layer uses between 2 and 8 *softmax* outputs to yield a state assignment vector $\chi: \mathbb{R}^m \to \Delta^{n}$ where $\Delta^{n} = \{ s \in \mathbb{R}^n \mid 0 \le s_i \le 1, \sum_i^n s_i = 1 \}$ representing the probability of a state assignment. One lobe thus transforms a system state into a state occupation probability. We can also view this value as a kind of reverse ambiguity, i.e. how sure the network is that the system is part of a certain cluster. These outputs are then used as the input for the VAMP scoring function. We use the new enhanced version with physical constraints[2], particularly the ones for positive entries and reversibility.

[1] Mardt, A., Pasquali, L., Wu, H. & Noé, F. VAMPnets for deep learning of molecular kinetics. Nat Comms 1–11 (2017). doi:10.1038/s41467-017-02388-1

[2] Mardt, A., Pasquali, L., Noé, F. & Wu, H. Deep learning Markov and Koopman models with physical constraints. arXiv:1912.07392 [physics] (2019).

### Data preparation
We use minimum residue distances as input ($\frac{(N-2)(N-3)}{2}$ values, where $N$ is the number of residues, because pairs of first and second neighbors are excluded; this gives 780 values for $N = 42$) and first normalize the data:

In [None]:
# Load the precomputed feature array from disk.
# NOTE(review): the file name suggests 780 minimum-distance features for a
# reduced ("mini") data set -- confirm.
input_flat = np.load("intermediate/mindist-780-mini.npy")
# Uncomment for full version:
# input_data = unflatten(input_flat, lengths)
# Reduced version: `lengths` is wrapped in a one-element list here.
# NOTE(review): presumably unflatten expects one collection of lengths per
# group of trajectories -- confirm against its definition.
input_data = unflatten(input_flat, [lengths])

### Neural network hyperparameters
To allow for a larger hyperparameter search space, we use the self-normalizing neural network approach by Klambauer *et al.* [3], thus using SELU units, `AlphaDropout` and normalized `LeCun` weight initialization. The other hyperparameters are defined at the beginning of this notebook.

[3] Klambauer, G., Unterthiner, T., Mayr, A. & Hochreiter, S. Self-Normalizing Neural Networks. arXiv.org cs.LG, (2017).

In [None]:
# --- Network definition
activation = "selu"              # Activation function of the network
init = "lecun_normal"            # Weight initialization scheme
n_dims = input_data[0].shape[1]  # Input dimension, read off the first trajectory
nres = 42                        # Number of residues

# --- Training
lag = 20                         # Lag time (used as the network lag when fitting)
n_epoch = 100                    # Upper bound on training epochs
n_epoch_s = 10000                # Upper bound on epochs for the S optimization
n_batch = 5000                   # Training batch size
epsilon = 1e-7                   # Floating point noise

# --- Time axis
dt = 0.25                        # Trajectory timestep in ns

# --- Scan ranges
outsizes = np.arange(2, 9)       # Output sizes 2, 3, ..., 8
lags = np.array([1, 2, 5, 10, 20, 50, 100])

### Run
We run the training several times with different train/test splits to get an error estimate; this is referred to as bootstrap aggregating (*bagging*).

In [None]:
# A single generator serves all attempts. Each attempt first writes the
# generator state to disk (presumably the current train/test indices), then
# trains one model per output size, and finally redraws the indices.
generator = DataGenerator(input_data, ratio=ratio, dt=dt, max_frames=1000)
for attempt in range(attempts):
    generator.save("models/model-idx-{0}.hdf5".format(attempt))
    for n_out in outsizes:
        print("Training n={0} i={1}/{2}".format(n_out, attempt + 1, attempts))
        # Build a fresh settings dict for every model so nothing is shared
        # between successive fits.
        network_settings = {
            "width": width,
            "depth": depth,
            "learning_rate": learning_rate,
            "regularization": regularization,
            "dropout": dropout,
            "batchnorm": True,
            "lr_factor": 1e-2,
        }
        koop = KoopmanModel(n=n_out, network_lag=lag, verbose=1,
                            nnargs=network_settings)
        koop.fit(generator)
        koop.save("models/model-ve-{0}-{1}.hdf5".format(n_out, attempt))
        # Release the trained model before the next one is built
        del koop
        gc.collect()
    generator.regenerate_indices()