In [None]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.stats import percentileofscore

import torch
from sklearn.gaussian_process import GaussianProcessRegressor

from moses.vae import VAE
from moses.vae_property import VAEPROPERTY
from moses.vae.trainer import VAETrainer
from moses.vae_property.trainer import VAEPROPERTYTrainer 

from moses.metrics import QED, SA, logP
from moses.utils import get_mol


from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit import rdBase
#from rdkit import RDLogger

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

rdBase.DisableLog('rdApp.*')

## GPR

In [None]:
# Number of leading rows of the GPR fitting CSV to keep (used as a slice bound when that CSV is read).
use_sample = 500

In [None]:
# Read the three GPR dataframes; only the fitting set is cut down to its
# first `use_sample` rows, the test and start sets are kept whole.
train_df = pd.read_csv("../checkpoints/opimize_gpr/gpr_fit_ZINC250K_df.csv").iloc[:use_sample]
test_df = pd.read_csv("../checkpoints/opimize_gpr/gpr_test_ZINC250K_df.csv")
start_df = pd.read_csv("../checkpoints/opimize_gpr/opt_start_ZINC250K_df.csv")

In [None]:
# Report (rows, columns) of each dataframe, one per line.
print(
    f'gpr train: {train_df.shape}',
    f'gpr test: {test_df.shape}',
    f'gpr start: {start_df.shape}',
    sep='\n',
)

### Choose model

In [None]:
# Checkpoint selection: `folder_path` is the run directory that the config,
# vocabulary and model weights are read from. `model_name` is a label only;
# it is not used by the loading code in this cell.
model_name = 'VAEProp_obj_w0.1'
folder_path = "../checkpoints/ZINC250K_vae_property_obj_proploss_w0.1"
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoint files that come from a trusted source.
config = torch.load(f'{folder_path}/vae_property_config.pt')
vocab = torch.load(f'{folder_path}/vae_property_vocab.pt')

# Show which string representation the config selects and which property
# regression tasks it lists.
print(f"Use Selfies: {config.use_selfies}")
print(config.reg_prop_tasks)

In [None]:
# Column order handed to the data loaders: the molecule string first
# (SELFIES or SMILES, depending on the config), then four property columns.
cols = ['SELFIES' if config.use_selfies else 'SMILES', 'logP', 'qed', 'SAS', 'obj']
train_data = train_df[cols].values
test_data = test_df[cols].values
start_data = start_df[cols].values


# Weights file tagged "080" (presumably the epoch-80 checkpoint — confirm
# against the training run).
model_path = f'{folder_path}/vae_property_model_080.pt'


model = VAEPROPERTY(vocab, config)
model.load_state_dict(torch.load(model_path))

# Loaders are built without shuffling, presumably so that encoded rows stay
# in the same order as the source dataframes — TODO confirm in the trainer.
trainer = VAEPROPERTYTrainer(config)
train_loader = trainer.get_dataloader(model, train_data, shuffle=False)
test_loader = trainer.get_dataloader(model, test_data, shuffle=False)
start_loader = trainer.get_dataloader(model, start_data, shuffle=False)

In [None]:
# Encode the GPR fitting set: for every batch collect the input token
# tensors, the property targets and the encoder outputs (mu, logvar, z),
# then assemble everything into `train_data_df`.
model.eval()

x_list = []
z_list = []
mu_list = []
logvar_list = []
y_list = []

# Inference only: disable autograd so no computation graph is built for the
# encoder passes (the results are converted to NumPy right away).
with torch.no_grad():
    for batch in train_loader:
        x = batch[0]
        y = batch[1]
        x_list.extend(x)
        y_list.extend(np.array(y).squeeze())

        mu, logvar, z, _ = model.forward_encoder(x)
        z_list.extend(z.detach().cpu().numpy())
        mu_list.extend(mu.detach().cpu().numpy())
        logvar_list.extend(logvar.detach().cpu().numpy())

# One row per molecule, one column per property target.
y_list = np.array(y_list).squeeze()
GP_train_y = y_list.reshape(-1, y_list.shape[-1])

train_data_df = pd.DataFrame(GP_train_y, columns=['logP', 'qed', 'SAS', 'obj'])
# Each cell of the z / mu / logvar columns holds one per-molecule vector.
train_data_df = pd.concat(
    [train_data_df, pd.DataFrame({'z': z_list, 'mu': mu_list, 'logvar': logvar_list})],
    axis=1,
)
# Decode the token ids back to strings and put them in the first column.
train_data_df.insert(
    0,
    'SELFIES' if config.use_selfies else 'SMILES',
    [vocab.ids2string(point.cpu().detach().numpy()) for point in x_list],
)

In [None]:
# Encode the GPR test set: for every batch collect the input token tensors,
# the property targets and the encoder outputs (mu, logvar, z), then assemble
# everything into `test_data_df`.
model.eval()

test_x_list = []
test_z_list = []
test_mu_list = []
test_logvar_list = []
test_y_list = []

# Inference only: disable autograd so no computation graph is built for the
# encoder passes (the results are converted to NumPy right away).
with torch.no_grad():
    for batch in test_loader:
        x = batch[0]
        y = batch[1]
        test_x_list.extend(x)
        test_y_list.extend(np.array(y).squeeze())

        mu, logvar, z, _ = model.forward_encoder(x)
        test_z_list.extend(z.detach().cpu().numpy())
        test_mu_list.extend(mu.detach().cpu().numpy())
        test_logvar_list.extend(logvar.detach().cpu().numpy())

# One row per molecule, one column per property target. The targets must come
# from the test batches; reshaping the training `y_list` here would attach
# training labels to the test latents.
test_y_list = np.array(test_y_list).squeeze()
GP_test_y = test_y_list.reshape(-1, test_y_list.shape[-1])

test_data_df = pd.DataFrame(GP_test_y, columns=['logP', 'qed', 'SAS', 'obj'])
# Each cell of the z / mu / logvar columns holds one per-molecule vector.
test_data_df = pd.concat(
    [test_data_df, pd.DataFrame({'z': test_z_list, 'mu': test_mu_list, 'logvar': test_logvar_list})],
    axis=1,
)
# Decode the token ids back to strings and put them in the first column.
test_data_df.insert(
    0,
    'SELFIES' if config.use_selfies else 'SMILES',
    [vocab.ids2string(point.cpu().detach().numpy()) for point in test_x_list],
)

In [None]:
# GP inputs: the `z` column stores one latent vector per row, so `.values`
# alone is a 1-D object array. Stack the rows into a numeric
# (n_samples, latent_dim) matrix that a regressor can consume.
GP_Train_x = np.stack(train_data_df.z.values)
GP_Test_x = np.stack(test_data_df.z.values)

# GP targets: the scalar objective column.
GP_Train_y = train_data_df['obj'].values
GP_Test_y = test_data_df['obj'].values

In [None]:
from bayes_opt import BayesianOptimization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

In [None]:
# input: z
# output: obj

from moses.metrics import QED, SA, logP


def objective_function(z): #ground truth
    """Decode latent points with the VAE and score the decoded molecules.

    Parameters
    ----------
    z
        Latent batch forwarded to ``model.sample`` as ``z=``; ``len(z)``
        molecules are sampled. (Assumed to hold one latent vector per row —
        TODO confirm against ``model.sample``.)

    Returns
    -------
    numpy.ndarray
        ``5 * QED - SAS`` for every decoded molecule, in sampling order.
        Entries are NaN where the decoded string could not be parsed by RDKit
        or where scoring raised.
    """
    gen = model.sample(len(z), max_len=100, z=z, temp=1.0, analysis=True)
    gen_df = pd.DataFrame(gen, columns=['gen_SELFIES' if config.use_selfies else 'gen_SMILES'])

    if config.use_selfies:
        # NOTE(review): `sf` (presumably the `selfies` package) is not imported
        # in the visible part of this notebook — it must be in scope for this
        # branch to run.
        gen_df['gen_SMILES'] = [sf.decoder(x) for x in gen_df['gen_SELFIES']]

    # RDKit parses SMILES, so always build molecules from the SMILES column
    # (for SELFIES models that is the decoded column created above).
    mol = gen_df['gen_SMILES'].apply(Chem.MolFromSmiles)

    qed_list = []
    sa_list = []

    for gen_mol in mol.values:
        # Start from NaN so an unparsable or unscorable molecule never
        # inherits the scores of the previous one.
        qed, sa = np.nan, np.nan

        if gen_mol is not None:
            try:
                qed = QED(gen_mol)
                sa = SA(gen_mol)
            except Exception:
                # Best effort: keep going, but mark this molecule as unscored.
                qed, sa = np.nan, np.nan

        qed_list.append(qed)
        sa_list.append(sa)

    gen_df['gen_qed'] = qed_list
    gen_df['gen_SAS'] = sa_list

    # Objective to maximise: reward drug-likeness, penalise synthetic
    # accessibility score.
    obj = 5 * gen_df['gen_qed'] - gen_df['gen_SAS']
    return obj.values

In [None]:
# Search bounds for the optimised variable `z`.
pbounds = {'z': (-100, 100)
}

# `f` must be the callable itself: the optimiser invokes it for every probed
# point. Passing `objective_function(z)` would evaluate it once up front and
# hand the optimiser a result instead of a function.
# NOTE(review): bayes_opt calls `f` with one float keyword argument per
# `pbounds` key (here `z=<float>`) and expects a scalar back — confirm how
# that scalar should be mapped to a latent vector inside `objective_function`.
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=pbounds,
    random_state=42
)

optimizer.maximize(
    init_points=2,  # Random exploratory steps
    n_iter=10       # Steps of Bayesian Optimization
)

In [None]:
# Parameter values of the best point the optimizer has evaluated so far;
# the bare name on the last line displays them as the cell output.
best_params = optimizer.max['params']
best_params

In [None]:
# Build the GP kernel from the optimised parameters when they are available.
# The search above only defines `z`, so `C` / `RBF_length_scale` may be
# missing; fall back to 1.0 for both instead of raising KeyError. These values
# are only starting points when the regressor tunes kernel hyperparameters
# during `fit` (the scikit-learn default).
optimized_kernel = C(best_params.get('C', 1.0)) * RBF(best_params.get('RBF_length_scale', 1.0))
optimized_gp = GaussianProcessRegressor(kernel=optimized_kernel)

# Fit on latent vectors -> objective. The latent column is stored as one
# vector per row, so stack it into a numeric (n_samples, latent_dim) matrix.
gp_fit_x = np.stack(GP_Train_x)
optimized_gp.fit(gp_fit_x, GP_Train_y)

# Making predictions (on the fitting set itself)
predictions = optimized_gp.predict(gp_fit_x)
print(predictions)

## Latent Vector Interpolation

In [None]:
import torch
import numpy as np
import pandas as pd
from viz_utils import slerp, InterpolationLoader, z_to_smiles

In [None]:
# Settings for the latent interpolation cells below.
model_type = 'vae_property' # either 'vae_property' or 'vae'
data_type = 'selfies'  # molecule string representation: 'selfies' or 'smiles'
# data_type = 'smiles'
steps = 4     # number of interpolation points, endpoints included (np.linspace(0, 1, steps))
epoch = 60    # passed to InterpolationLoader as best_epoch
sample_1 = 3  # passed to InterpolationLoader as i_1
sample_2 = 5  # passed to InterpolationLoader as i_2

In [None]:
# Load data, model and the latents of the two selected samples through the
# project helper.
# NOTE(review): the exact meaning of each returned value is defined in
# viz_utils.InterpolationLoader, which is not visible here. This call rebinds
# z_list, y_list, train_data and model, replacing the values set by earlier cells.
z_list, y_list, _, train_data, model = InterpolationLoader(dataPATH="../moses/dataset/data/ZINC250K/",
                                                    model_type=model_type,
                                                    data_type=data_type,
                                                    best_epoch=epoch,
                                                    i_1=sample_1, i_2=sample_2,
                                                    )

# First column of the loaded data (presumably the molecule strings — confirm).
original_mol = train_data[:,0]

# Apply slerp between rows 0 and 1 of z_list at `steps` evenly spaced
# fractions in [0, 1], then hand the resulting latents to z_to_smiles.
interpolated_latents = torch.tensor(np.array([slerp(val, z_list[0,:], z_list[1,:]) for val in np.linspace(0, 1, steps)]))
viz_df = z_to_smiles(model, original_mol, interpolated_latents,
                     data_type=data_type, steps=steps,
                     temp=0.3, argmax=False)

In [None]:
# Take the decoded strings from the column that matches the representation
# in use: 'SMILES' for SMILES runs, "SELFIES" for everything else.
result_mol = viz_df['SMILES' if data_type == 'smiles' else "SELFIES"].values

result_mol

## Latent Space Visualization

In [None]:
# Representation used for the latent-space plots; it selects the checkpoint
# folder in a later cell.
data_type = 'smiles' # alternative: 'selfies'
# data_type = 'selfies'

In [None]:
# Full ZINC250K train/test splits. This rebinds train_df and test_df, which
# earlier held the GPR subsets.
train_df = pd.read_csv("../moses/dataset/data/ZINC250K/train.csv")
test_df = pd.read_csv("../moses/dataset/data/ZINC250K/test.csv")

In [None]:
# Pick the checkpoint folder that matches the chosen representation.
if data_type == 'selfies':
    folder_path = "../checkpoints/ZINC250K_vae_property_obj_proploss_w0.1_selfies"
else:
    folder_path = "../checkpoints/ZINC250K_vae_property_obj_proploss_w0.1_smiles"

    
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoint files that come from a trusted source.
config = torch.load(f'{folder_path}/vae_property_config.pt')
vocab = torch.load(f'{folder_path}/vae_property_vocab.pt')

print(f"Use Selfies: {config.use_selfies}")
print(config.reg_prop_tasks)

# Molecule string column first (per the config), then four property columns.
cols = ['SELFIES' if config.use_selfies else 'SMILES', 'logP', 'qed', 'SAS', 'obj']
train_data = train_df[cols].values
test_data = test_df[cols].values

# Weights file tagged "080" (presumably the epoch-80 checkpoint — confirm).
model_path = f'{folder_path}/vae_property_model_080.pt'

model = VAEPROPERTY(vocab, config)
model.load_state_dict(torch.load(model_path))

# Unshuffled loaders over the full train and test splits.
trainer = VAEPROPERTYTrainer(config)
train_loader = trainer.get_dataloader(model, train_data, shuffle=False)
test_loader = trainer.get_dataloader(model, test_data, shuffle=False)

In [None]:
# One panel per checkpoint: encode the training loader, project the encoder
# means onto their first two principal components, rescale both components to
# [0, 1] and colour each point by the last property column of its target.
fig, axes = plt.subplots(1, 6, figsize=(30, 4))

for i, epoch in enumerate(['00', 20, 40, 60, 80, 'final']):

    # The 'final' checkpoint has no epoch suffix in its file name; all the
    # others are tagged with a leading zero plus the epoch ('000', '020', ...).
    if epoch == 'final':
        model_path = f'{folder_path}/vae_property_model.pt'
    else:
        model_path = f'{folder_path}/vae_property_model_0{epoch}.pt'

    model = VAEPROPERTY(vocab, config)
    model.load_state_dict(torch.load(model_path))

    model.eval()

    x_list = []
    z_list = []
    mu_list = []
    logvar_list = []
    y_list = []

    # Inference only: disable autograd so six passes over the whole training
    # loader do not build computation graphs.
    with torch.no_grad():
        for step, batch in enumerate(train_loader):
            x = batch[0]
            y = batch[1]
            x_list.extend(x)
            y_list.extend(np.array(y).squeeze())

            mu, logvar, z, _ = model.forward_encoder(x)
            z_list.extend(z.detach().cpu().numpy())
            mu_list.extend(mu.detach().cpu().numpy())
            logvar_list.extend(logvar.detach().cpu().numpy())

    # PCA is refitted for every checkpoint, so axes are not comparable
    # across panels.
    viz = PCA(n_components=2)
    z_viz = viz.fit_transform(mu_list)
    explained_variance = viz.explained_variance_ratio_
    print(f"(Epoch {epoch})Explained variance: {explained_variance}")

    # Colour value: last column of the property targets.
    y_list = np.array(y_list)[:, -1]

    z_viz = MinMaxScaler().fit_transform(z_viz)

    scatter = axes[i].scatter(z_viz[:, 0], z_viz[:, 1], c=y_list, cmap='viridis', marker='.', s=10, alpha=0.5, edgecolors='none')

    axes[i].set_title(f'Epoch {epoch}')
    axes[i].set_xlabel('PC1')
    axes[i].set_ylabel('PC2')

    fig.colorbar(scatter, ax=axes[i])

plt.tight_layout()
plt.show()