# Preprocessing

## Featurizing

In [1]:
import mdtraj as md
import numpy as np
import datetime
import tensorflow as tf
import nglview as nv


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

from utils import split_dataset
from vae import build_asmsa_vae



2025-07-31 14:14:28.539911: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-31 14:14:28.554980: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-31 14:14:28.561592: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-31 14:14:28.573534: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [2]:
# Input files: the trajectory (XTC) and the PDB used as its topology.
tr = "trpcage_ds_nH.xtc"
conf = "trpcage_npt400_nH.pdb"

# Load the trajectory, using the PDB as topology.
traj = md.load_xtc(tr, top=conf)
# Align all frames onto frame 0 of the same trajectory, fitting on the backbone atoms.
# The result is not assigned, so the alignment is relied on to modify `traj` itself;
# the returned trajectory is what the cell displays.
backbone_atoms = traj.topology.select('backbone')
traj.superpose(traj, 0, atom_indices=backbone_atoms)


<mdtraj.Trajectory with 50001 frames, 144 atoms, 20 residues, and unitcells at 0x7f9f7ef86cb0>

In [3]:
# Interactive 3D view of the loaded trajectory.
view = nv.show_mdtraj(traj)

# Add a line representation for the protein selection.
view.add_representation('line', selection='protein')
# Last expression of the cell: renders the widget.
view

NGLWidget(max_frame=50000)

In [4]:
# Trajectory dimensions, reused by the featurization below.
n_frames = traj.n_frames  # 50001 for this trajectory
n_atoms = traj.n_atoms    # 144 for this trajectory

In [5]:
# --- Atom selections -------------------------------------------------------
p_indices = traj.topology.select("protein")
n_p = len(p_indices)

bb_indices = traj.topology.select("backbone")
n_bb = len(bb_indices)

ca_indices = traj.topology.select("name CA")
# Every unordered pair of C-alpha atoms: each atom is paired with all the
# C-alpha atoms that follow it in the selection.
pairs = np.array([
    (first, second)
    for position, first in enumerate(ca_indices)
    for second in ca_indices[position + 1:]
])

# --- Cartesian backbone coordinates ----------------------------------------
# Slice the backbone atoms out of traj.xyz, then flatten each frame from
# (n_bb, 3) into a single row of length n_bb * 3.
coords_bb = traj.xyz[:, bb_indices, :]
coords = coords_bb.reshape(n_frames, -1)

# --- Distances --------------------------------------------------------------
# C-alpha pair distances and bond lengths for every frame.
dists = md.compute_distances(traj, pairs)

bonds = list(traj.topology.bonds)
bond_pairs = [[bond.atom1.index, bond.atom2.index] for bond in bonds]
bond_lengths = md.compute_distances(traj, bond_pairs)

# --- Backbone dihedrals, encoded as sin/cos ---------------------------------
# Element [1] of each compute_* result is taken as the array of angles.
phi_angles = md.compute_phi(traj)[1]
psi_angles = md.compute_psi(traj)[1]
phi_sin, phi_cos = np.sin(phi_angles), np.cos(phi_angles)
psi_sin, psi_cos = np.sin(psi_angles), np.cos(psi_angles)

# --- Side-chain dihedrals, encoded as sin/cos -------------------------------
chi1_angles = md.compute_chi1(traj)[1]
chi2_angles = md.compute_chi2(traj)[1]
chi1_sin, chi1_cos = np.sin(chi1_angles), np.cos(chi1_angles)
chi2_sin, chi2_cos = np.sin(chi2_angles), np.cos(chi2_angles)


In [6]:
# Feature matrix, one row per frame: flattened backbone coordinates followed by
# the sin/cos encodings of the phi, psi, chi1 and chi2 dihedrals.
feat = np.concatenate(
    [
        coords,
        phi_sin, phi_cos,
        psi_sin, psi_cos,
        chi1_sin, chi1_cos,
        chi2_sin, chi2_cos,
    ],
    axis=1,
)

In [7]:
# Min-max scale every feature column; `scaler` is kept so decoded samples can
# be mapped back with inverse_transform later on.
# NOTE(review): the scaler is fitted on all frames, i.e. before the
# train/validation/test split, so held-out frames influence the scaling range.
scaler = MinMaxScaler()
scaler.fit(feat)
features_normalized = scaler.transform(feat)
features_normalized.shape

(50001, 366)

## NN preprocessing

In [8]:
# Usage: build the train / validation / test datasets (and `ds_all`) from the
# normalized features.
ds_train, ds_val, ds_test, ds_all = split_dataset(
    features_normalized,
    train_size=70,
    val_size=15,
    batch_size=64,
    seed=42,
)

# Optional: data augmentation for the autoencoder
def add_data_augmentation(ds_train, noise_factor=0.1):
    """
    Return a dataset whose inputs are corrupted with Gaussian noise while the
    targets are left clean (denoising-autoencoder style augmentation).

    Args:
        ds_train: dataset exposing ``map`` and yielding ``(x, y)`` pairs.
        noise_factor: standard deviation of the zero-mean Gaussian noise.

    Returns:
        The mapped dataset yielding ``(x_noisy, y)``, where ``x_noisy`` is
        ``x`` plus noise, clipped to ``[0, 1]``.
    """
    def add_noise(x, y):
        # Draw the noise in the input's own dtype: tf.random.normal defaults to
        # float32 and TensorFlow does not promote dtypes implicitly, so float64
        # features would otherwise make the addition below fail.
        noise = tf.random.normal(tf.shape(x), stddev=noise_factor, dtype=x.dtype)
        # Clip back to [0, 1], assuming the inputs were min-max normalized.
        x_noisy = tf.clip_by_value(x + noise, 0.0, 1.0)
        return x_noisy, y  # noisy input, clean target

    return ds_train.map(add_noise, num_parallel_calls=tf.data.AUTOTUNE)

# To enable data augmentation, uncomment the line below:
# ds_train = add_data_augmentation(ds_train, noise_factor=0.05)

2025-07-31 14:14:35.302452: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8075 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe MIG 1g.10gb, pci bus id: 0000:61:00.0, compute capability: 8.0


Dataset Statistics:
  Train: 35000 samples, 546 batches
  Val:   7500 samples, 118 batches
  Test:  7501 samples, 118 batches
  Batch size: 64


# VAE

In [9]:
# Note to self (in Italian): "Batch norm, if used, goes before the layer activation."
'''
Batch Norm, nel caso, va prima della layer activation)
'''

'\nBatch Norm, nel caso, va prima della layer activation)\n'

In [10]:
# Size of the latent space; also used in the checkpoint file name below.
latent_dim = 64

# The input width is the number of feature columns.
vae, encoder, decoder = build_asmsa_vae(
    feat.shape[1],
    latent_dim=latent_dim,
)

In [11]:
# Each run logs into its own timestamped TensorBoard directory.
log_dir = f"logs/autoencoder/{datetime.datetime.now():%Y%m%d-%H%M%S}"

callbacks = [
    # TensorBoard logging once per epoch, with weight histograms and the model graph.
    TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        write_graph=True,
        update_freq='epoch',
    ),

    # Stop when val_loss has not improved by at least min_delta for 15 epochs.
    # The patience is longer than the LR scheduler's so that a reduced
    # learning rate gets a chance to help before training is stopped.
    EarlyStopping(
        monitor="val_loss",
        mode='min',
        min_delta=1e-5,
        patience=15,
        restore_best_weights=True,
        verbose=1,
    ),

    # Halve the learning rate after 7 epochs without val_loss improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=7,
        min_lr=1e-7,
        verbose=1,
    ),

    # Keep only the best full model (not just weights), judged by val_loss.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=f'best_autoencoder_{latent_dim}d.keras',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1,
    ),
]

To monitor training, start TensorBoard from a terminal:

```
tensorboard --logdir logs/autoencoder --host localhost --port 6006
```

In [None]:
# AdamW optimizer with a small weight decay; the betas are stated explicitly.
learning_rate = 1e-4
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    weight_decay=1e-5,
)

# No loss is passed to compile(); the training log reports loss,
# reconstruction_loss and kl_loss, so the model presumably computes them
# internally -- confirm in vae.py.
vae.compile(optimizer=optimizer)
vae.fit(
    ds_train,
    validation_data=ds_val,
    epochs=500,
    callbacks=callbacks,
)

Epoch 1/500


I0000 00:00:1753971281.625185   31675 service.cc:146] XLA service 0x7f992801ff20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753971281.625321   31675 service.cc:154]   StreamExecutor device (0): NVIDIA A100 80GB PCIe MIG 1g.10gb, Compute Capability 8.0
2025-07-31 14:14:41.753423: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-07-31 14:14:42.337009: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m 12/546[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 5ms/step - kl_loss: 0.9727 - loss: 1.6669 - reconstruction_loss: 0.6942    

I0000 00:00:1753971285.010194   31675 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m541/546[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - kl_loss: 0.1063 - loss: 0.7901 - reconstruction_loss: 0.6838
Epoch 1: val_loss improved from inf to 0.66318, saving model to best_autoencoder_64d.keras
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - kl_loss: 0.1055 - loss: 0.7891 - reconstruction_loss: 0.6837 - val_kl_loss: 0.0039 - val_loss: 0.6632 - val_reconstruction_loss: 0.6593 - learning_rate: 1.0000e-04
Epoch 2/500


  return saving_lib.save_model(model, filepath)


[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - kl_loss: 0.0028 - loss: 0.6619 - reconstruction_loss: 0.6592
Epoch 2: val_loss improved from 0.66318 to 0.65955, saving model to best_autoencoder_64d.keras
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - kl_loss: 0.0028 - loss: 0.6619 - reconstruction_loss: 0.6592 - val_kl_loss: 0.0010 - val_loss: 0.6595 - val_reconstruction_loss: 0.6585 - learning_rate: 1.0000e-04
Epoch 3/500
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - kl_loss: 8.1530e-04 - loss: 0.6594 - reconstruction_loss: 0.6586
Epoch 3: val_loss improved from 0.65955 to 0.65860, saving model to best_autoencoder_64d.keras
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - kl_loss: 8.1499e-04 - loss: 0.6594 - reconstruction_loss: 0.6586 - val_kl_loss: 3.9356e-04 - val_loss: 0.6586 - val_reconstruction_loss: 0.6582 - learning_rate: 1.0000e-04
Epoch 4/500
[1m541/546[0m 

# Decode and visualize

In [None]:
# Reload the saved 2-D autoencoder and pull out its encoder/decoder sub-models.
# `load_model` was never imported in this notebook, so it is called through the
# `tf` namespace to keep the cell working on a fresh kernel.
# The training cells above save `best_autoencoder_{latent_dim}d.keras` with
# latent_dim = 64, so this 2-D checkpoint has to come from a run with latent_dim = 2.
# NOTE(review): if the model built in vae.py relies on custom classes, load_model
# may additionally need `custom_objects` -- confirm.
autoencoder = tf.keras.models.load_model('best_autoencoder_2d.keras')
encoder = autoencoder.get_layer("encoder")
decoder = autoencoder.get_layer("decoder")

In [None]:
import matplotlib.pyplot as plt
def plot_latent_space(latent_dim, encoder, dataset, conf, target, cmap='rainbow', figsize=(8,8), traj_file=None):
    """
    Embed a dataset with the encoder and scatter-plot the first two latent
    dimensions, coloured by each frame's RMSD to the reference structure.

    The embedding closest (Euclidean distance) to ``target`` is marked with a
    black "X" and returned, so it can be passed to the decoder.

    Args:
        latent_dim: dimensionality of the latent space; the selected sample is
            reshaped to ``(1, latent_dim)``.
        encoder: the encoder model; ``predict`` returns [z_mean, z_log_var, z].
        dataset: input accepted by ``encoder.predict``. Sample ``i`` is coloured
            with the RMSD of trajectory frame ``i``, so the order must match.
        conf: path of the reference PDB; also used as the trajectory topology.
        target: point in latent space, broadcastable against the embeddings
            (e.g. shape ``(1, latent_dim)``).
        cmap: matplotlib colormap
        figsize: tuple for figure size
        traj_file: path of the trajectory used for the RMSD colouring; when
            None, the notebook-level ``tr`` is used.

    Returns:
        ``(emb, sample)``: all embeddings, and the single embedding closest to
        ``target`` with shape ``(1, latent_dim)``.

    Raises:
        ValueError: if the number of embeddings differs from the number of
            trajectory frames.
    """
    # results = [z_mean, z_log_var, z]; index 2 is the sampled z.
    # NOTE(review): the sampled z is stochastic; results[0] (z_mean) would give
    # a reproducible embedding -- confirm which one is intended.
    results = encoder.predict(dataset)
    emb = np.array(results[2])

    if traj_file is None:
        traj_file = tr  # fall back to the notebook-level trajectory path

    # Per-frame RMSD to the reference structure, used only to colour the points.
    rms_ref = md.load_pdb(conf)
    rms_tr = md.load_xtc(traj_file, top=rms_ref)
    rmsd = md.rmsd(rms_tr, rms_ref)
    if len(rmsd) != len(emb):
        raise ValueError(
            f"got {len(emb)} embeddings but {len(rmsd)} trajectory frames; "
            "the dataset must contain exactly one sample per frame"
        )

    # Pick the K embeddings nearest to the target (K = 1: the single closest one).
    dists = np.linalg.norm(emb - target, axis=1)
    K = 1
    idx_closest = np.argsort(dists)[:K]
    sample = emb[idx_closest].reshape(1, latent_dim)

    plt.figure(figsize=figsize)

    points = plt.scatter(emb[:,0], emb[:,1], c=rmsd, s=0.5, cmap=cmap)
    plt.scatter(sample[:,0], sample[:,1], marker="X", c="Black")
    # Make the figure self-explanatory: colour scale and axis labels.
    plt.colorbar(points, label="RMSD to reference")
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")

    plt.show()
    return emb, sample

In [None]:
# Plot the 2-D latent space and pick the embedding nearest to the origin.
latent_dim = 2
target = np.array([[0, 0]])
emb, sample = plot_latent_space(latent_dim, encoder, ds_all, conf, target)

In [None]:
# Reference structure, its backbone-only slice (its topology is used when the
# decoded sample is written out), and the per-frame RMSD of the trajectory.
rms_ref = md.load_pdb(conf)
rms_ref_bb   = rms_ref.atom_slice(bb_indices)
rms_tr = md.load_xtc(tr, top=rms_ref)
rmsd = md.rmsd(rms_tr, rms_ref)

# Kept as the last expression so the notebook actually displays it; as the
# first statement of the cell it had no effect.
emb.shape

In [None]:
# Decode the selected latent point and undo the min-max scaling.
s = decoder.predict(sample)
s_orig = scaler.inverse_transform(s)

# The leading coords.shape[1] columns of a feature row are the flattened
# backbone coordinates; fold them back into one (n_bb, 3) frame.
coords_flat = s_orig[0, :coords.shape[1]]
coords_recons = coords_flat.reshape(n_bb, 3)

# Single-frame trajectory on the backbone-only topology (`rms_ref_bb` is
# defined in the previous cell).
new_traj = md.Trajectory(
    xyz=coords_recons[np.newaxis, ...],
    topology=rms_ref_bb.topology,
)
new_traj.save_pdb("reconstructed.pdb")

# Show the reconstructed structure as lines (nglview is imported in the first
# cell; `view.add_cartoon()` is the alternative representation).
view = nv.show_file("reconstructed.pdb")
view.clear_representations()
view.add_line()
view.center()
view