In [None]:
import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from rdkit.Chem import PandasTools
import zipfile
from io import BytesIO

import selfies as sf

import sys
sys.path.append("..")
import moses
from moses.vae import VAE
from moses.vae_property import VAEPROPERTY
from moses.utils import CharVocab, StringDataset, SELFIESVocab
from moses.vae.trainer import VAETrainer
from moses.vae_property.trainer import VAEPROPERTYTrainer 

from moses.metrics import QED, SA, logP
from moses.utils import get_mol

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Directory holding the saved config, vocabulary and model checkpoints
# that the rest of this notebook reads from.
folder_path = "../checkpoints/ZINC_vae_property_20240606_025246"

# NOTE(review): both files are unpickled as full Python objects (``config`` is
# accessed via attributes below). On PyTorch releases where ``torch.load``
# defaults to ``weights_only=True`` this may need an explicit
# ``weights_only=False`` -- confirm against the installed version.
config_file = f'{folder_path}/vae_property_config.pt'
vocab_file = f'{folder_path}/vae_property_vocab.pt'
config = torch.load(config_file)
vocab = torch.load(vocab_file)

# Report which molecular string representation the run was configured with.
print(f"Use Selfies: {config.use_selfies}")

In [None]:
def _encode_latent_means(model, loader, batch_size):
    """Encode every full batch of ``loader`` and collect latent means and targets.

    Parameters
    ----------
    model
        Object exposing ``device`` and ``forward_encoder(inputs)``; the latter
        must return a 3-tuple whose first element is the latent mean tensor.
    loader
        Iterable of batches where ``batch[0]`` is a sequence of input tensors
        and ``batch[1]`` holds the matching target values.
    batch_size : int
        Only batches containing exactly this many inputs are encoded.

    Returns
    -------
    tuple of numpy.ndarray
        ``(latent_means, targets)``: the stacked latent means (singleton
        dimensions squeezed out) and a flat 1-D array of targets in the same
        sample order.
    """
    mu_batches = []
    y_batches = []
    # Inference only: no_grad avoids building an autograd graph for every batch.
    with torch.no_grad():
        for batch in loader:
            # NOTE(review): a smaller final batch is skipped, exactly as in the
            # previous version of this cell, so the plotted points are unchanged.
            # Confirm whether dropping those samples is intentional.
            if len(batch[0]) != batch_size:
                continue
            input_batch = tuple(data.to(model.device) for data in batch[0])
            # Only the mean is visualised; the sampled z and the KL term are unused.
            mu, _, _ = model.forward_encoder(input_batch)
            mu_batches.append(mu.detach().cpu().numpy())
            # Flatten per batch so the colour array is 1-D and aligned with the points.
            y_batches.append(np.array(batch[1]).reshape(-1))
    latent_means = np.concatenate(mu_batches, axis=0).squeeze()
    targets = np.concatenate(y_batches)
    return latent_means, targets


# Checkpoints to compare. Entries may be str or int; each one is zero-padded to
# three digits when building the checkpoint filename.
epochs = ['00', 20, 40, 60, 80]

# Loop-invariant setup: the data subset and the trainer do not depend on which
# checkpoint is loaded, so build them once instead of once per epoch.
train_data = moses.get_dataset('train', config)[:50000]
trainer = VAEPROPERTYTrainer(config)

# One panel per checkpoint; squeeze=False keeps indexing uniform for a single epoch.
fig, axes = plt.subplots(1, len(epochs), figsize=(4 * len(epochs), 4), squeeze=False)
axes = axes.ravel()

for ax, epoch in zip(axes, epochs):
    model_path = f'{folder_path}/vae_property_model_{str(epoch).zfill(3)}.pt'

    model = VAEPROPERTY(vocab, config)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine;
    # load_state_dict then copies the values onto the model's own device.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    sample_loader = trainer.get_dataloader(model, train_data, shuffle=False)
    z_means, y_values = _encode_latent_means(model, sample_loader, config.n_batch)

    # Project the latent means onto their first two principal components.
    viz = PCA(n_components=2)
    z_viz = viz.fit_transform(z_means)
    explained_variance = viz.explained_variance_ratio_
    print(f"(Epoch {epoch})Explained variance: {explained_variance}")

    # Rescale each component to [0, 1] so all panels share the same axis range.
    z_viz = MinMaxScaler().fit_transform(z_viz)

    scatter = ax.scatter(z_viz[:, 0], z_viz[:, 1], c=y_values, cmap='viridis',
                         marker='.', s=10, alpha=0.5, edgecolors='none')
    ax.set_title(f'Epoch {epoch}')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    fig.colorbar(scatter, ax=ax)

plt.tight_layout()
plt.show()