# Preprocessing

## Featurizing

In [None]:
import mdtraj as md
import numpy as np
import datetime
import tensorflow as tf
import nglview as nv
import os, sys


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import load_model

# Switch to the project checkout; the working directory is then put on
# sys.path so the `src.*` imports that follow can be resolved from it.
# NOTE(review): the model paths used later live under /home/tedeschg/... while
# this cell changes into /home/jovyan/... — confirm both are valid on this host.
%cd /home/jovyan/ASMSA/mydev


# Insert the current working directory at the front of sys.path, only once.
repo_dir = os.getcwd()   
if repo_dir not in sys.path:
    sys.path.insert(0, repo_dir)

from src.utils import plot_latent_space
from src.asmsa_callbacks import callbacks
from src.ae import asmsa_ae
from src.asmsa_features import process_trajectory
from src.asmsa_split import asmsa_datasets

# Model tag forwarded as `model=` to callbacks() and plot_latent_space().
nn_model = 'ae'
# Latent-space size; also appears in the saved model file name.
latent_dim = 2

In [None]:
# Trajectory file and the structure file that provides its topology.
tr = "trpcage_ds_nH.xtc"
conf = "trpcage_npt400_nH.pdb"

# Load the trajectory, then superpose it onto its own first frame using the
# backbone atoms as the fitting selection.
traj = md.load_xtc(tr, top=conf)
backbone_atoms = traj.top.select('backbone')
traj.superpose(reference=traj, frame=0, atom_indices=backbone_atoms)


In [None]:
# Interactive 3D viewer for the superposed trajectory.
view = nv.show_mdtraj(traj)

# Add a line representation restricted to the protein selection.
view.add_representation('line', selection='protein')
# Last expression of the cell, so the notebook displays the widget.
view

In [None]:
# Featurize the trajectory/topology pair. The result is indexed later with
# 'features_normalized', 'coords', 'scaler_coords' and 'scaler_angles'.
feat = process_trajectory(tr, conf)


## NN preprocessing

In [None]:
# Usage: build the train / validation / test / full datasets from the
# normalised feature matrix.
ds_train, ds_val, ds_test, ds_all, info = asmsa_datasets(
    feat['features_normalized'],
    train_size=70,
    val_size=15,
    batch_size=64,
    seed=42,
)

# AE

In [None]:
# Design note kept as a bare string literal (the notebook echoes it as the
# cell output). In English: "Batch Norm, if used, goes before the layer
# activation."
'''
Batch Norm, nel caso, va prima della layer activation)
'''

In [None]:
# Build the autoencoder plus its encoder and decoder. The input width is the
# number of columns of the normalised feature matrix.
autoencoder, encoder, decoder = asmsa_ae(
    n_features=feat['features_normalized'].shape[1],
    latent_dim=latent_dim)

# Print the model overview.
autoencoder.summary()


In [None]:
# Keras loss objects shared by the reconstruction loss defined below.
mse_fn = tf.keras.losses.MeanSquaredError()
mae_fn = tf.keras.losses.MeanAbsoluteError()


def recon_loss(y_true, y_pred):
    """Return the weighted reconstruction loss: 0.8 * MSE + 0.2 * MAE.

    Args:
        y_true: Target values.
        y_pred: Values produced by the model.
    """
    squared_term = 0.8 * mse_fn(y_true, y_pred)
    absolute_term = 0.2 * mae_fn(y_true, y_pred)
    return squared_term + absolute_term

In [None]:
# Log directory for this run, named after the launch timestamp.
log_dir = f"logs/autoencoder/{datetime.datetime.now():%Y%m%d-%H%M%S}"
cb = callbacks(log_dir, latent_dim, monitor="val_loss", model=nn_model)

# AdamW optimizer with all hyperparameters spelled out.
learning_rate = 1e-4
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=learning_rate, weight_decay=1e-5, beta_1=0.9, beta_2=0.999
)

# Build the model that is actually trained; this also rebinds `encoder` and
# `decoder`.
ae, encoder, decoder = asmsa_ae(
    latent_dim=latent_dim,
    n_features=feat['features_normalized'].shape[1],
)

# Compile the autoencoder with the custom weighted reconstruction loss.
ae.compile(loss=recon_loss, optimizer=optimizer)


tensorboard --logdir logs/autoencoder --host localhost --port 6006

In [None]:
# Train on ds_train with ds_val as validation data, using the callbacks in `cb`.
ae.fit(
    ds_train,
    validation_data=ds_val,
    epochs=200,
    callbacks=cb,
)

In [None]:
# Move the saved model file from the working directory into the models folder.
# NOTE(review): presumably ae_{latent_dim}d.keras is written during training
# by the callbacks — confirm.
# NOTE(review): dest is under /home/tedeschg/... whereas the notebook changes
# into /home/jovyan/... at the top — confirm the destination exists here.
src = f"ae_{latent_dim}d.keras"
dest = "/home/tedeschg/prj/ASMSA/mydev/models/"

!mv {src} {dest}

# Decode and visualize

In [None]:
# Reload the trained autoencoder from disk. The custom loss is passed in
# custom_objects under the name "recon_loss" so load_model can resolve it.
path = f"/home/tedeschg/prj/ASMSA/mydev/models/ae_{latent_dim}d.keras"

autoencoder = load_model(
    path,
    custom_objects={"recon_loss": recon_loss}
)

# Fetch the encoder and decoder from the loaded model by layer name.
encoder = autoencoder.get_layer("encoder")
decoder = autoencoder.get_layer("decoder")


In [None]:
# Pick the element at index 30 of the first test batch; take(1) yields at most
# one batch, so the loop body runs at most once.
for batch_x, _ in ds_test.take(1):
    sample_x = batch_x[30]

# Add a leading axis of size 1 before handing the sample to the encoder.
sample_x_batch = sample_x[tf.newaxis, ...]
test = encoder.predict(sample_x_batch)

test

In [None]:
# Latent-space origin as a (1, latent_dim) integer row. It is built from
# latent_dim so it stays valid when the latent size is changed (the previous
# two-element literal only reshaped correctly for latent_dim == 2).
# NOTE(review): `target` is not passed to plot_latent_space below; the encoded
# test sample `test` is used instead — confirm this is intended.
target = np.zeros((1, latent_dim), dtype=int)
# Atom index selections reused by later cells.
bb_indices = traj.topology.select('backbone')
ca_indices = traj.topology.select('name CA')
emb, sample = plot_latent_space(latent_dim, encoder, ds_all, conf, tr, test, bb_indices, model=nn_model, exact=True)

In [None]:
import numpy as np
from sklearn.manifold import trustworthiness

# Gather all validation inputs into a single array (the second element of
# each dataset item is ignored).
X_val_np = np.concatenate([x for x, _ in ds_val], axis=0)

z_mean = encoder.predict(X_val_np, batch_size=1024)

# Mean and covariance of the latent embedding over the validation set.
mu = z_mean
mu_mean = mu.mean(axis=0)
# atleast_2d keeps the covariance a matrix even when the latent space is 1-D,
# where np.cov would otherwise return a 0-d array.
mu_cov = np.atleast_2d(np.cov(mu, rowvar=False))
# Size the diagonal mask from the matrix itself instead of hard-coding 2, so
# the cell keeps working when latent_dim changes.
off_diag = mu_cov[~np.eye(mu_cov.shape[0], dtype=bool)]
# These are covariance entries, not correlations, so label them accordingly.
print("mean:", mu_mean, "var:", np.diag(mu_cov), "cov_offdiag:", off_diag)

tw = trustworthiness(X_val_np, mu, n_neighbors=10)
print("trustworthiness:", tw)  # > 0.95 is good



In [None]:


# Reference structure, plus copies sliced to the backbone and CA atom indices.
rms_ref = md.load_pdb(conf)
rms_ref_bb  = rms_ref.atom_slice(bb_indices)
rms_ref_ca  = rms_ref.atom_slice(ca_indices)
# Reload the trajectory with the reference as topology and compute the RMSD
# of its frames against the reference.
rms_tr = md.load_xtc(tr, top=rms_ref)
rmsd = md.rmsd(rms_tr, rms_ref)

In [None]:
# Indices of the protein atoms, and how many of them there are.
p_indices = traj.top.select("protein")
n_p = p_indices.shape[0]

In [None]:
# Decode the latent sample back into feature space.
s = decoder.predict(sample)

# The decoded vector is treated as [coordinates | angles]: the coordinate
# width comes from the featurizer output, and the remainder is angles.
coords_size = feat['coords'].shape[1]
angles_size = s.shape[1] - coords_size

# Separate the coordinate and angle sections.
s_coords, s_angles = s[:, :coords_size], s[:, coords_size:]

# Undo the scaling of each section with its own scaler.
coords_orig = feat['scaler_coords'].inverse_transform(s_coords)
angles_orig = feat['scaler_angles'].inverse_transform(s_angles)

# Only the coordinates of the first decoded row are used for the structure.
coords_flat = coords_orig[0]
coords_p = coords_flat.reshape(n_p, 3)
# Keep the rows of protein atoms that are also backbone atoms.
mask_bb = np.isin(p_indices, bb_indices)
coords_bb = coords_p[mask_bb]
# NOTE(review): assumes four backbone atoms per residue with CA in second
# position — confirm against the topology.
coords_ca = coords_bb[1::4]

# Wrap the backbone coordinates as a single-frame trajectory and write it out.
new_traj = md.Trajectory(
    xyz=coords_bb[np.newaxis, ...],
    topology=rms_ref_bb.topology,
)

new_traj.save_pdb("./models/ae_reconstructed.pdb")


In [None]:
# Open the reconstructed structure written by the previous cell.
view = nv.show_file('./models/ae_reconstructed.pdb')
# Replace the default representations with a plain line drawing.
view.clear_representations()
view.add_line() 
#view.add_cartoon()
view.center()
# Last expression of the cell, so the notebook displays the widget.
view

In [None]:
# Mean squared difference between the first decoded row and the test sample.
mse = np.square(s[0] - sample_x.numpy()).mean()
print("MSE ricostruzione:", mse)

In [None]:
from src.asmsa_analysis import analyze_reconstruction, plot_section_errors
import matplotlib.pyplot as plt
# Usage with the notebook's data: relies on sample_x and s[0] from earlier
# cells being defined.
orig = sample_x.numpy() 
recon = s[0]

# Full analysis
fig, metrics = analyze_reconstruction(orig, recon, title_prefix="Autoencoder ")

# Per-section errors only
fig_sections, section_stats = plot_section_errors(orig, recon, n_sections=25, 
                                                   title="Analisi Errori per Sezione")

plt.show()

# Print the metrics returned by the analysis, one per line.
print("\n=== METRICHE RICOSTRUZIONE ===")
for metric, value in metrics.items():
    # The formatted line used to be built and then discarded; print it.
    print(f"{metric.upper()}: {value:.6f}")