In [None]:
import os
import sys
sys.path.append("..")

import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy.stats import percentileofscore

from moses.vae import VAE
from moses.vae_property import VAEPROPERTY
from moses.vae.trainer import VAETrainer
from moses.vae_property.trainer import VAEPROPERTYTrainer 

from rdkit import rdBase
rdBase.DisableLog('rdApp.*')
import selfies as sf

import torch
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import seaborn as sns
import matplotlib.pyplot as plt

from viz_utils import *

### Repeat GPR

In [None]:
# Settings for the repeated-GPR experiment below.
model_name = 'vae_property_obj_proploss_w0.1'  # alternative checkpoint: 'vae_property_obj_w0.1'
data_type = 'selfies'

# Number of optimisation iterations per run.
num_iterations = 50

# Passed to objective_function / generate_df as nan_qed / nan_sa
# (presumably fallback QED / SA scores for undecodable molecules -- confirm in viz_utils).
nan_qed, nan_sa = 0, 5

In [None]:
# Repeat the GP-based latent optimisation several times for each molecule
# representation and store, per repetition, three CSV files: the candidates that
# improved the best objective ("best"), every proposed candidate ("all"), and
# periodic snapshots taken every `save_epoch` iterations ("step<save_epoch>").
for data_type in ['smiles', 'selfies']:
    for sample_num in [2000]:
        (GP_Train_x, GP_Train_y,
         GP_Test_x, GP_Test_y,
         train_data_df, test_data_df,
         model, vocab, config) = ready_gpr(sample_num, data_type=data_type, model_name=model_name)

        for rp in tqdm(range(6)):
            # Initial GP training data
            train_z, train_y = initial_data(GP_Train_x, GP_Train_y)

            # Search bounds: every latent dimension shares the global min / max of
            # the initial latent vectors. Tensor.min()/max() replace the builtin
            # min()/max(), which iterated the flattened tensor one element at a time.
            bounds = torch.stack([torch.full((train_z.shape[1],), train_z.min()),
                                  torch.full((train_z.shape[1],), train_z.max())])

            best_z_list = []
            all_z_list = []
            new_z_step = []

            best_z_list_idx = []
            step_list = []

            best_perform = -np.inf
            save_epoch = 40

            # `it` instead of `iter`: the loop variable stays bound in the notebook
            # namespace afterwards and must not shadow the builtin iter().
            for it in range(num_iterations):
                gp = train_gp(train_z, train_y)
                new_z = optimize_acq(gp, bounds, train_y)
                new_y = torch.tensor([objective_function(model, config, nan_qed, nan_sa, new_z,
                                                         temp=1.0, test=True)])

                improved = bool(new_y > best_perform)
                is_last = it == num_iterations - 1

                if improved:
                    best_perform = new_y
                    print("New best z found at iter", it, ":", "y:", round(float(new_y), 2))

                # The final candidate is always recorded, but only once: previously a
                # new best found on the last iteration was appended twice.
                if improved or is_last:
                    best_z_list.append(new_z)
                    best_z_list_idx.append(it)

                # Same de-duplication for the periodic snapshots.
                if it % save_epoch == 0 or is_last:
                    new_z_step.append(new_z)
                    step_list.append(it)

                all_z_list.append(new_z)

                # Add the new observation to the GP training data
                train_z = torch.cat((train_z, new_z), dim=0)
                train_y = torch.cat((train_y, new_y), dim=0)

            best_z_list = reshape_z(best_z_list)
            new_z_step = reshape_z(new_z_step)
            all_z_list = reshape_z(all_z_list)

            # Make sure the per-repetition output folder exists so the run does not
            # fail at save time after the expensive optimisation loop.
            out_dir = f"./repeat/rp{rp}"
            os.makedirs(out_dir, exist_ok=True)
            run_tag = f"n{sample_num}_ep{num_iterations}_rp{rp}"

            best_df = generate_df(best_z_list, best_z_list_idx, model, config, nan_qed, nan_sa,
                                  temp=1.0, test=True)
            best_df.to_csv(f"{out_dir}/vae_prop_{data_type}_best_{run_tag}.csv", index=False)

            all_df = generate_df(all_z_list, range(len(all_z_list)), model, config, nan_qed, nan_sa,
                                 temp=1.0, test=True)
            all_df.to_csv(f"{out_dir}/vae_prop_{data_type}_all_{run_tag}.csv", index=False)

            step_df = generate_df(new_z_step, step_list, model, config, nan_qed, nan_sa,
                                  temp=1.0, test=True)
            # File name follows save_epoch (still "step40" with the value set above).
            step_df.to_csv(f"{out_dir}/vae_prop_{data_type}_step{save_epoch}_{run_tag}.csv", index=False)

## GPR

In [None]:
# Settings for the single GPR run below.
model_name = 'vae_property_obj_proploss_w0.1'  # alternative checkpoint: 'vae_property_obj_w0.1'
data_type = 'selfies'

sample_num, num_iterations = 1000, 50

# Passed to objective_function / generate_df as nan_qed / nan_sa
# (presumably fallback QED / SA scores for undecodable molecules -- confirm in viz_utils).
nan_qed, nan_sa = 0, 10

In [None]:
# Build the GP train/test tensors and load the model, vocabulary and config
# for the configured sample size, representation and checkpoint name.
(
    GP_Train_x,
    GP_Train_y,
    GP_Test_x,
    GP_Test_y,
    train_data_df,
    test_data_df,
    model,
    vocab,
    config,
) = ready_gpr(
    sample_num,
    data_type=data_type,
    model_name=model_name,
)

In [None]:


# Single GP-based optimisation run over the latent space. Records the candidates
# that improved the best objective, every proposed candidate, and periodic
# snapshots taken every `save_epoch` iterations.

# Initial GP training data
train_z, train_y = initial_data(GP_Train_x, GP_Train_y)

# Search bounds: every latent dimension shares the global min / max of the initial
# latent vectors. Tensor.min()/max() replace the builtin min()/max(), which
# iterated the flattened tensor one element at a time.
bounds = torch.stack([torch.full((train_z.shape[1],), train_z.min()),
                      torch.full((train_z.shape[1],), train_z.max())])

best_z_list = []
all_z_list = []
new_z_step = []

best_z_list_idx = []
step_list = []

best_perform = -np.inf
save_epoch = 40

# `it` instead of `iter`: the loop variable stays bound in the notebook namespace
# afterwards and must not shadow the builtin iter().
for it in tqdm(range(num_iterations)):
    gp = train_gp(train_z, train_y)
    new_z = optimize_acq(gp, bounds, train_y)
    new_y = torch.tensor([objective_function(model, config, nan_qed, nan_sa, new_z,
                                             temp=1.0, test=True)])

    improved = bool(new_y > best_perform)
    is_last = it == num_iterations - 1

    if improved:
        best_perform = new_y
        print("New best z found at iter", it, ":", "y:", round(float(new_y), 2))

    # The final candidate is always recorded, but only once: previously a new best
    # found on the last iteration was appended twice.
    if improved or is_last:
        best_z_list.append(new_z)
        best_z_list_idx.append(it)

    # Same de-duplication for the periodic snapshots.
    if it % save_epoch == 0 or is_last:
        new_z_step.append(new_z)
        step_list.append(it)

    all_z_list.append(new_z)

    # Add the new observation to the GP training data
    train_z = torch.cat((train_z, new_z), dim=0)
    train_y = torch.cat((train_y, new_y), dim=0)

# Report the best latent vector / objective over initial data plus all proposals.
print(f"최적의 z index {train_y.argmax()}:", train_z[train_y.argmax()])
print("최적의 목적 함수 값:", train_y.max().item())

In [None]:
# Convert the three collected candidate lists with reshape_z (same order as before:
# best, step snapshots, all). Note: this cell rebinds its inputs, so run it only
# once per optimisation run.
best_z_list, new_z_step, all_z_list = (
    reshape_z(z_seq) for z_seq in (best_z_list, new_z_step, all_z_list)
)

In [None]:
# Build the table for the best candidates (indexed by the iteration at which each
# was recorded) and write it to ./after_optim.
best_df = generate_df(
    best_z_list,
    best_z_list_idx,
    model,
    config,
    nan_qed,
    nan_sa,
    temp=1.0,
    test=True,
)

# For a plain-VAE run, use the "vae_" prefix instead of "vae_prop_" in the file name.
best_df.to_csv(
    f"./after_optim/vae_prop_{data_type}_best_n{sample_num}_ep{num_iterations}.csv",
    index=False,
)

In [None]:
# Show the molecules listed in best_df (vizualizeMol is presumably provided by viz_utils).
vizualizeMol(best_df, data_type=data_type)

In [None]:
# Objective value of each recorded best candidate against the iteration it came from.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(best_z_list_idx, best_df.obj.values)
ax.set_title('Updated Objective Function with Best Value')
ax.set_xlabel('Iteration')
ax.set_ylabel('Objective Function')
# One tick per recorded iteration, with the same label size as before.
ax.set_xticks(best_z_list_idx)
ax.tick_params(axis='x', labelsize=10)
plt.show()

In [None]:
# viz = PCA(n_components=2)
# z_viz = viz.fit_transform(train_z[:500])
# y_list = np.array(train_y[:500])

# explained_variance = viz.explained_variance_ratio_
# z_viz = MinMaxScaler().fit_transform(z_viz)

# scatter = plt.scatter(z_viz[:, 0], z_viz[:, 1], c=y_list, cmap='viridis', marker='.', s=10, alpha=0.5, edgecolors='none')

# new_z_viz = viz.transform(new_z_list)
# new_y_list = np.array(train_y[500:])
# new_z_viz = MinMaxScaler().fit_transform(new_z_viz)
# plt.scatter(new_z_viz[:, 0], new_z_viz[:, 1], c=new_y_list, marker='x', s=50, alpha=1, edgecolors='none')

# plt.colorbar(scatter)

### All z reconstruction

In [None]:
# Build the table for every proposed candidate (indexed 0..n-1) and write it
# to ./after_optim.
all_df = generate_df(
    all_z_list,
    range(len(all_z_list)),
    model,
    config,
    nan_qed,
    nan_sa,
    temp=1.0,
    test=True,
)

# For a plain-VAE run, use the "vae_" prefix instead of "vae_prop_" in the file name.
all_df.to_csv(
    f"./after_optim/vae_prop_{data_type}_all_n{sample_num}_ep{num_iterations}.csv",
    index=False,
)

In [None]:
# Objective value of every proposed candidate, in proposal order.
plt.figure(figsize=(20, 5))
plt.plot(range(len(all_z_list)), all_df.obj.values)
plt.title('Objective function with all iteration')
plt.xlabel('Iteration')
plt.ylabel('Objective Function')
# Tick every second iteration. The previous range(len(all_z_list), 2) started at
# len and stopped at 2, i.e. it was empty and removed all x ticks.
plt.xticks(range(0, len(all_z_list), 2), fontsize=10)
plt.show()

In [None]:
# vizualizeMol(all_df, data_type=data_type)

In [None]:
# viz = PCA(n_components=2)
# z_viz = viz.fit_transform(train_z[:500])
# y_list = np.array(train_y[:500])

# explained_variance = viz.explained_variance_ratio_
# z_viz = MinMaxScaler().fit_transform(z_viz)

# scatter = plt.scatter(z_viz[:, 0], z_viz[:, 1], c=y_list, cmap='viridis', marker='.', s=10, alpha=0.5, edgecolors='none')

# new_z_viz = viz.transform(new_z_list)
# new_y_list = np.array(train_y[500:])
# new_z_viz = MinMaxScaler().fit_transform(new_z_viz)
# plt.scatter(new_z_viz[:, 0], new_z_viz[:, 1], c=new_y_list, marker='x', s=50, alpha=1, edgecolors='none')

# plt.colorbar(scatter)

### Viz per step

In [None]:
# Build the table for the periodic snapshots (indexed by their iteration) and
# write it to ./after_optim.
step_df = generate_df(
    new_z_step,
    step_list,
    model,
    config,
    nan_qed,
    nan_sa,
    temp=1.0,
    test=True,
)

# For a plain-VAE run, use the "vae_" prefix instead of "vae_prop_" in the file name.
step_df.to_csv(
    f"./after_optim/vae_prop_{data_type}_step40_n{sample_num}_ep{num_iterations}.csv",
    index=False,
)

In [None]:
# Objective value of the snapshot candidates (one per saved step plus the final
# iteration), plotted against the iteration at which each snapshot was taken.
plt.figure(figsize=(15, 5))
plt.plot(step_list, step_df.obj.values)
# The title used to be copied from the all-iterations plot; this figure only
# shows the saved snapshots.
plt.title('Objective function at saved steps')
plt.xlabel('Iteration')
plt.ylabel('Objective Function')
plt.xticks(step_list, fontsize=10)
plt.show()

In [None]:
# Show the molecules listed in step_df (vizualizeMol is presumably provided by viz_utils).
vizualizeMol(step_df, data_type=data_type)

In [None]:
# viz = PCA(n_components=2)
# z_viz = viz.fit_transform(train_z[:500])
# y_list = np.array(train_y[:500])

# explained_variance = viz.explained_variance_ratio_
# z_viz = MinMaxScaler().fit_transform(z_viz)

# scatter = plt.scatter(z_viz[:, 0], z_viz[:, 1], c=y_list, cmap='viridis', marker='.', s=10, alpha=0.5, edgecolors='none')

# new_z_viz = viz.transform(new_z_list)
# new_y_list = np.array(train_y[500:])
# new_z_viz = MinMaxScaler().fit_transform(new_z_viz)
# plt.scatter(new_z_viz[:, 0], new_z_viz[:, 1], c=new_y_list, marker='x', s=50, alpha=1, edgecolors='none')

# plt.colorbar(scatter)

## Latent Vector Interpolation

In [None]:
import torch
import numpy as np
import pandas as pd
from viz_utils import slerp, InterpolationLoader, z_to_smiles

In [None]:
# Settings for the latent-vector interpolation.
model_type = 'vae'      # alternative: 'vae_property'
data_type = 'selfies'   # alternative: 'smiles'

steps = 4               # number of interpolation points (endpoints included)
epoch = 60              # passed to InterpolationLoader as best_epoch
sample_1, sample_2 = 3, 10  # passed to InterpolationLoader as i_1 / i_2

In [None]:
# Load the two latent vectors, the training data and the model for the chosen
# checkpoint, then interpolate between the two latents with slerp.
z_list, y_list, _, train_data, model = InterpolationLoader(
    dataPATH="../moses/dataset/data/ZINC250K/",
    model_type=model_type,
    data_type=data_type,
    best_epoch=epoch,
    i_1=sample_1,
    i_2=sample_2,
)

# First column of train_data is passed to z_to_smiles as the original molecules.
original_mol = train_data[:, 0]

# `steps` evenly spaced interpolation weights in [0, 1] between z_list[0] and z_list[1].
interpolated_latents = torch.tensor(
    np.array([slerp(t, z_list[0, :], z_list[1, :]) for t in np.linspace(0, 1, steps)])
)

viz_df = z_to_smiles(
    model,
    original_mol,
    interpolated_latents,
    data_type=data_type,
    steps=steps,
    temp=0.3,
    argmax=False,
)

In [None]:
# Pick the column that matches the representation in use and display its values.
result_mol = viz_df['SMILES' if data_type == 'smiles' else "SELFIES"].values

result_mol

## Latent Space Visualization

In [None]:
# Molecule representation used by the latent-space visualisation cells below.
data_type = 'smiles' # alternative: 'selfies'
# data_type = 'selfies'

In [None]:
# Read the ZINC250K train and test splits.
train_df, test_df = (
    pd.read_csv(f"../moses/dataset/data/ZINC250K/{split}.csv")
    for split in ("train", "test")
)

In [None]:
# Load config, vocabulary and final weights of the plain VAE for the chosen
# representation, then build non-shuffled loaders over the train / test tables.
#
# To inspect the property VAE instead, point folder_path at
# "../checkpoints/ZINC250K_vae_property_obj_proploss_w0.1_<selfies|smiles>", load
# vae_property_config.pt / vae_property_vocab.pt / vae_property_model_080.pt and use
# VAEPROPERTY + VAEPROPERTYTrainer in place of VAE + VAETrainer.
folder_path = "../checkpoints/ZINC250K_vae_" + ('selfies' if data_type == 'selfies' else 'smiles')

# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
config = torch.load(f'{folder_path}/vae_config.pt')
vocab = torch.load(f'{folder_path}/vae_vocab.pt')

if data_type == 'selfies':
    print(f"Use Selfies: {config.use_selfies}")
    print(config.reg_prop_tasks)

# Molecule string column first, followed by the property columns; 'obj' is last.
mol_col = 'SELFIES' if config.use_selfies else 'SMILES'
cols = [mol_col] + ['logP', 'qed', 'SAS', 'obj']
train_data, test_data = train_df[cols].values, test_df[cols].values

model_path = f'{folder_path}/vae_model.pt'
model = VAE(vocab, config)
model.load_state_dict(torch.load(model_path))
trainer = VAETrainer(config)

# shuffle=False keeps the loaders deterministic across runs.
train_loader = trainer.get_dataloader(model, train_data, shuffle=False)
test_loader = trainer.get_dataloader(model, test_data, shuffle=False)

In [None]:
# For a series of training checkpoints, encode the training set, project the
# posterior means onto their first two principal components and colour each point
# by its objective value -- one subplot per checkpoint.
fig, axes = plt.subplots(1, 6, figsize=(30, 4))

for i, epoch in enumerate(['00', 20, 40, 60, 80, 'final']):

    # Intermediate checkpoints are named vae_model_0<epoch>.pt; the final one is
    # vae_model.pt. (This used to point at vae_property_model.pt, which does not
    # match the plain-VAE folder and model class used here. For the property VAE
    # use vae_property_model_0<epoch>.pt / vae_property_model.pt with VAEPROPERTY.)
    if epoch == 'final':
        model_path = f'{folder_path}/vae_model.pt'
    else:
        model_path = f'{folder_path}/vae_model_0{epoch}.pt'

    model = VAE(vocab, config)
    model.load_state_dict(torch.load(model_path))

    model.eval()

    x_list = []
    z_list = []
    mu_list = []
    logvar_list = []

    # Inference only: no_grad avoids building autograd graphs for the whole
    # training set.
    with torch.no_grad():
        for batch in train_loader:
            x_list.extend(batch)

            mu, logvar, z, _ = model.forward_encoder(batch)
            z_list.extend(z.detach().cpu().numpy())
            mu_list.extend(mu.detach().cpu().numpy())
            logvar_list.extend(logvar.detach().cpu().numpy())

    viz = PCA(n_components=2)
    z_viz = viz.fit_transform(mu_list)
    explained_variance = viz.explained_variance_ratio_
    print(f"(Epoch {epoch})Explained variance: {explained_variance}")

    # Colour values: last column of train_data ('obj', see `cols`). Previously
    # y_list stayed empty and np.array(y_list)[:, -1] raised an IndexError.
    # NOTE(review): this assumes train_loader (shuffle=False) yields samples in the
    # same order as the rows of train_data -- confirm against the trainer's
    # collate function (e.g. no per-batch re-sorting).
    y_list = train_data[:len(z_viz), -1].astype(float)
    if len(y_list) != len(z_viz):
        raise ValueError(
            f"Encoded {len(z_viz)} samples but train_data has only {len(y_list)} rows; "
            "train_loader and train_data are out of sync."
        )

    # Rescale both components to [0, 1] so the panels share the same axes range.
    z_viz = MinMaxScaler().fit_transform(z_viz)

    scatter = axes[i].scatter(z_viz[:, 0], z_viz[:, 1], c=y_list, cmap='viridis', marker='.', s=10, alpha=0.5, edgecolors='none')

    axes[i].set_title(f'Epoch {epoch}')
    axes[i].set_xlabel('PC1')
    axes[i].set_ylabel('PC2')

    fig.colorbar(scatter, ax=axes[i])

plt.tight_layout()
plt.show()

## Optimization latent space

In [None]:
# Load the training table together with the saved optimisation results
# ("best" and "all" candidates) for the given representation / run size.
data_type = 'selfies'  # alternative: 'smiles'
n_sample, n_epoch = 1000, 200

df = pd.read_csv("../moses/dataset/data/ZINC250K/train.csv")
gen_df, all_df = (
    pd.read_csv(f"./after_optim/vae_prop_{data_type}_{subset}_n{n_sample}_ep{n_epoch}.csv")
    for subset in ("best", "all")
)

In [None]:
# Display the table of best candidates loaded above.
gen_df

In [None]:
# Latent-space plot with the optimisation results, using base_pca='mu'
# (presumably the PCA basis is fit on the encoder means -- confirm in viz_utils).
viz_latent_with_optim(df, gen_df, all_df, data_type=data_type, model_type='vae_property',
                      base_pca='mu')

In [None]:
# Same plot with base_pca='z'
# (presumably the PCA basis is fit on sampled latent vectors -- confirm in viz_utils).
viz_latent_with_optim(df, gen_df, all_df, data_type=data_type, model_type='vae_property',
                      base_pca='z')

## Analysis of optimization results