In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.enable_eager_execution()
import numpy as np
import os
import time
# from lossT import sparse_categorical_crossentropy

### Parameters

In [41]:
# Bin edges used to discretize the data: 20 edges starting at -0.9, step 0.1.
bins = np.arange(-0.9, 1.1, 0.1)
num_bins = len(bins)

# Integer label for every possible state in the range considered.
# For 2d systems, this is not the same as the number of representative values.
all_combs = list(range(num_bins))
vocab = sorted(all_combs)
vocab_size = len(vocab)

# Length of each sequence, and the offset (in steps) between the
# past (input) and the future (output).
seq_length = 100
shift = 1

# Number of sequences per batch.
BATCH_SIZE = 64

# Size of the buffer used to shuffle the dataset.
BUFFER_SIZE = 50000

# Model hyperparameters.
embedding_dim = 128
rnn_units = 1024

# Number of training epochs.
EPOCHS = 40

# Number of steps to generate at prediction time.
num_generate = 2000000
# Sampling temperature:
#   - low temperatures result in more predictable text;
#   - higher temperatures result in more surprising text.
# Experiment to find the best setting.
temperature = 1.0

In [44]:
def split_input_target(chunk):
    """
    Produce an (input, target) pair from one sequence.

    The input is `chunk` without its last `shift` elements and the target is
    `chunk` without its first `shift` elements, so the target is the input
    advanced by `shift` steps. `shift` is read from module scope.
    """
    return chunk[:-shift], chunk[shift:]

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the Embedding -> recurrent -> Dense sequential model.

    vocab_size: size of the embedding table and of the Dense output layer.
    embedding_dim: width of the embedding vectors.
    rnn_units: number of units in the recurrent layer.
    batch_size: fixed batch size; the recurrent layer is stateful, so the
        batch dimension is baked into the input shape.

    The recurrent layer class is taken from the module-level name `rnn`,
    which must be bound before this function is called.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])

    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)

    # One output per vocabulary entry (the loss treats these as logits).
    readout = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, readout])

def loss(labels, logits):
    """Sparse categorical cross-entropy of `labels` against raw `logits`."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)
#     return sparse_categorical_crossentropy(labels, logits, from_logits=True)

def generate_text(pmodel, num_generate, temperature, start_string):
    """
    Sample `num_generate` successive values from the model.

    The model is first fed the whole of `start_string`; after that each
    sampled id is fed back in as the next single-step input, relying on the
    model's internal state to carry the history.

    Parameters
    ----------
    pmodel : model that supports `reset_states()` and is callable on a
        tensor of ids with a leading batch dimension of 1.
    num_generate : int
        Number of values to sample.
    temperature : float
        The model output is divided by this value before sampling.
    start_string : sequence
        Seed sequence; every element must be a key of `char2idx`.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (num_generate + 1, 1). Row 0 is a placeholder
        (0.0), not a prediction; rows 1.. hold the sampled values in order.
        Callers should skip row 0.
    """

    # Converting the start string to numbers (vectorizing) and adding the
    # batch dimension.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Preallocate the result instead of growing it with np.vstack on every
    # step (which re-copies the whole array each iteration). Row 0 is kept as
    # a placeholder so the returned layout matches what callers slice off.
    text_generated = np.zeros((num_generate + 1, 1))

    # Here batch size = 1
    pmodel.reset_states()
    for i in range(num_generate):

        predictions = pmodel(input_eval)

        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale by the temperature, sample one id per time step, and keep the
        # sample drawn for the last time step.
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted id as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated[i + 1, 0] = idx2char[predicted_id]

    return text_generated

### Read data

In [33]:
# Read columns 1 and 2 of the COLVAR file as phi and psi, skipping the
# first 7 lines of the file.
infile = 'DATA_aladip/COLVAR_T450'
phi, psi = np.loadtxt(infile, skiprows=7, usecols=(1, 2), unpack=True)

# Cosine and sine of each variable.
cos_phi, sin_phi = np.cos(phi), np.sin(phi)
cos_psi, sin_psi = np.cos(psi), np.sin(psi)

# Spatially discretized data: index of the bin each sine value falls in.
idx_sin_phi = np.digitize(sin_phi, bins)
idx_sin_psi = np.digitize(sin_psi, bins)

### Training data

In [34]:
# Train on the first 10000 discretized sin(phi) values.
idx_2d = list(idx_sin_phi[:10000])
text = idx_2d

# Lookup tables between vocabulary entries and integer indices.
char2idx = {entry: index for index, entry in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

# Create training examples / targets: cut the series into windows of
# seq_length + shift steps, split each into (input, target), then shuffle
# and batch.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + shift, drop_remainder=True)
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

### Use the same trajectory as the validation data

In [35]:
# Discretize the same sin(phi) series again for validation.
idx_sin_phi_v = np.digitize(sin_phi, bins)
idx_2dv = list(idx_sin_phi_v)

# Validate on the first 200000 values.
vali = idx_2dv[:200000]
vali_as_int = np.array([char2idx[c] for c in vali])

# Create validation examples/targets with the same windowing, splitting,
# shuffling and batching as the training set.
vali_dataset = tf.data.Dataset.from_tensor_slices(vali_as_int)
sequences = vali_dataset.batch(seq_length + shift, drop_remainder=True)
vdataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

### Read the same trajectory and use its initial segment as the seed that activates the model for prediction

In [36]:
# Discretize the same sin(phi) series once more; its first 100000 values
# form the seed sequence passed to the model before generation starts.
idx_sin_phi_p = np.digitize(sin_phi, bins)
idx_2dp = list(idx_sin_phi_p)
text4activation = idx_2dp[:100000]

### Choose the GPU or CPU LSTM layer and build the training model

In [37]:
# Pick the recurrent layer: the CuDNN LSTM when a GPU is available,
# otherwise the generic LSTM with a sigmoid recurrent activation.
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNLSTM
else:
    import functools
    rnn = functools.partial(
        tf.keras.layers.LSTM, recurrent_activation='sigmoid')

# Build the training model on BOTH paths. This call used to sit inside the
# `else:` branch, which left `model` undefined whenever a GPU was present.
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 128)           2560      
_________________________________________________________________
lstm_4 (LSTM)                (64, None, 1024)          4722688   
_________________________________________________________________
dense_4 (Dense)              (64, None, 20)            20500     
Total params: 4,745,748
Trainable params: 4,745,748
Non-trainable params: 0
_________________________________________________________________
None


### Save checkpoint

In [38]:
# Checkpoints are written under this directory.
checkpoint_dir = './training_checkpoints'

# Checkpoint file name pattern; "{epoch}" is left as a placeholder in the
# path handed to the callback.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Training

In [None]:
# Number of full (seq_length + shift) windows in the training series, and
# the number of full batches those windows make up.
examples_per_epoch = len(text) // (seq_length + shift)
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Same bookkeeping for the validation series.
v_examples = len(vali_as_int) // (seq_length + shift)
v_steps_per_epoch = v_examples // BATCH_SIZE

# Both datasets are repeated EPOCHS times so that every epoch has data to
# draw from; weights are checkpointed through the callback.
history = model.fit(
    dataset.repeat(EPOCHS),
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=vdataset.repeat(EPOCHS),
    validation_steps=v_steps_per_epoch,
    callbacks=[checkpoint_callback])

In [42]:
# Rebuild the model with batch_size=1 for step-by-step prediction.
pmodel = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of passing None on to load_weights().
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if latest_ckpt is None:
    raise IOError(
        'No checkpoint found in {!r}; run the training cell first.'.format(
            checkpoint_dir))

pmodel.load_weights(latest_ckpt)
pmodel.build(tf.TensorShape([1, None]))

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
pmodel.summary()

# Print the length of seed for activating the model
print('length of seed: {}'.format(len(text4activation)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 128)            2560      
_________________________________________________________________
lstm_5 (LSTM)                (1, None, 1024)           4722688   
_________________________________________________________________
dense_5 (Dense)              (1, None, 20)             20500     
Total params: 4,745,748
Trainable params: 4,745,748
Non-trainable params: 0
_________________________________________________________________
None
length of seed: 1000


### Generate prediction sequentially

In [None]:
# Time the whole generation run, seeded with text4activation.
start0 = time.time()
prediction = generate_text(
    pmodel, num_generate, temperature, start_string=text4activation)
elapsed = time.time() - start0
print('Time taken for total {} sec\n'.format(elapsed))

### Save prediction

In [None]:
# Write the generated values to the file 'prediction'. Row 0 is skipped
# because generate_text starts its result array with a placeholder row
# that is not a prediction.
np.savetxt('prediction',prediction[1:])