In [1]:
!pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu116.html
!pip install -q dive-into-graphs

[0m

In [2]:
!pip install -q toolz
!pip install -q wandb

[0m

In [3]:
%load_ext autoreload
%autoreload 2
import os
import json
import argparse
import pickle 

import numpy as np
import pandas as pd
import torch

from molecule_optimizer.externals.fast_jtnn.datautils import SemiMolTreeFolder, SemiMolTreeFolderTest
from molecule_optimizer.runner.semi_jtvae import SemiJTVAEGeneratorPredictor
from torch_geometric.data import DenseDataLoader

import rdkit

# Raise the RDKit logger threshold to CRITICAL so lower-severity RDKit messages are not printed.
lg = rdkit.RDLogger.logger() 
lg.setLevel(rdkit.RDLogger.CRITICAL)

import warnings
# Silence every Python warning for the rest of the session.
# NOTE: this is a blanket filter, so it also hides deprecation and numerical warnings.
warnings.filterwarnings("ignore")

import math

In [4]:
# Load the experiment configuration; later cells read conf["model"][...] for the architecture
# and top-level keys (lr, anneal_rate, batch_size, ...) for the training call.
# The context manager closes the file handle deterministically instead of leaving an
# open descriptor behind for the garbage collector.
with open("training/configs/rand_gen_zinc250k_config_dict.json") as config_file:
    conf = json.load(config_file)

In [5]:
# Read the molecule table into a DataFrame; the cells below use its 'SMILES' and 'LogP' columns.
# NOTE: the variable name `csv` would shadow the standard-library `csv` module if that were imported.
csv = pd.read_csv("ZINC_310k.csv")

In [6]:
# 'SMILES' column of the table (a pandas Series); indexed again when the data split is built.
smiles = csv['SMILES']

In [7]:
# smiles = smiles[:60000]

In [8]:
# labels = torch.tensor(csv['LogP'][:60000]).float()

In [10]:
# Prediction targets: the full 'LogP' column as a tensor, cast to float32 by `.float()`.
labels = torch.tensor(csv['LogP']).float()

In [11]:
# Number of permuted samples held out as the test split.
N_TEST = 10000
#N_TEST = 200
# Fraction of the remaining (non-test) samples carved off as the validation split.
VAL_FRAC = 0.05
# Property name: embedded in the split file names and passed as `chem_prop` to the training call.
chem_prop = "LogP"
# Value passed as `load_epoch` to the training call
# (presumably the checkpoint iteration to resume from — confirm against the runner).
load_epoch = 143000

In [12]:
# if 'runner.xml' not in os.listdir("."):
#     runner = SemiJTVAEGeneratorPredictor(smiles)
#     processed_smiles, processed_idxs = SemiJTVAEGeneratorPredictor.preprocess(smiles) 
#     with open('runner.xml', 'wb') as f:
#         pickle.dump(runner, f)

In [11]:
# if 'runner_20.xml' not in os.listdir("."):
#     runner = SemiJTVAEGeneratorPredictor(smiles)
#     processed_smiles, processed_idxs = SemiJTVAEGeneratorPredictor.preprocess(smiles) 
#     with open('runner_20.xml', 'wb') as f:
#         pickle.dump(runner, f)

100%|██████████| 12/12 [00:35<00:00,  2.93s/it]
100%|██████████| 12/12 [19:09<00:00, 95.82s/it]


In [13]:
# Restore the previously saved runner object.
# The path is derived from `chem_prop` and `load_epoch` so it cannot drift away from the
# values handed to the training call; with the current constants it resolves to
# 'saved/runner_LogP_50_1_iter_143000.xml'.
# NOTE: despite the .xml extension the file is read with pickle. pickle.load can execute
# arbitrary code, so only open runner files you produced yourself.
runner_path = "saved/runner_" + chem_prop + "_50_1_iter_" + str(load_epoch) + ".xml"
with open(runner_path, 'rb') as f:
    runner = pickle.load(f)

In [14]:
# Instantiate the "rand_gen" model on the runner.
# Architecture sizes are copied from the "model" section of the config; the label
# statistics are computed from `labels` as it stands when this cell runs.
model_section = conf["model"]
model_kwargs = {
    key: model_section[key]
    for key in ("hidden_size", "latent_size", "depthT", "depthG")
}
model_kwargs["label_size"] = 1  # a single scalar property is predicted
model_kwargs["label_mean"] = labels.mean().item()
model_kwargs["label_var"] = labels.var().item()

runner.get_model("rand_gen", model_kwargs)

In [11]:
# NOTE(review): in the visible notebook `processed_smiles` and `processed_idxs` are only
# assigned inside the commented-out preprocessing cells, so on a fresh kernel this cell
# raises NameError unless that preprocessing is re-enabled and run first.
# `labels` is rebound from its own previous value: running this cell twice without rebuilding
# `labels` feeds the already-processed tensor back into get_processed_labels.
labels = runner.get_processed_labels(labels, processed_idxs)
# Alias consumed by the split cell.
preprocessed = processed_smiles

In [12]:

# Build a seeded random train / validation / test split and persist every part to disk.
#
# Layout: the first N_TEST permuted positions form the test split; of the remaining samples
# the first floor(len * VAL_FRAC) become the validation split and the rest is training data.
#
# WARNING: running this cell overwrites any split files already on disk. A checkpoint that was
# trained on an earlier split should not be resumed on a regenerated one, because samples of
# the new test split may already have been seen during training.
SPLIT_SEED = 42
SPLIT_TAG = chem_prop + "_50_1"  # common suffix shared by every split file name

# Features and labels are indexed with the same permutation, so they must line up one-to-one.
assert len(preprocessed) == len(labels), "preprocessed inputs and labels differ in length"

# A dedicated seeded generator makes the split reproducible across kernel restarts without
# touching NumPy's global random state.
perm_id = np.random.default_rng(SPLIT_SEED).permutation(len(labels))
test_idx = perm_id[:N_TEST]
rest_idx = perm_id[N_TEST:]

# NOTE(review): `smiles` is indexed with positions drawn from range(len(labels)); the saved
# SMILES files stay aligned with `preprocessed` only if `smiles` has the same length and
# order as the preprocessed data — confirm.
X_train = preprocessed[rest_idx]
X_train_smiles = smiles[rest_idx]
L_train = torch.tensor(labels.numpy()[rest_idx])

X_test = preprocessed[test_idx]
X_test_smiles = smiles[test_idx]
L_test = torch.tensor(labels.numpy()[test_idx])

# Carve the validation split off the front of the non-test samples.
val_cut = math.floor(len(X_train) * VAL_FRAC)

X_Val = X_train[:val_cut]
X_Val_smiles = X_train_smiles[:val_cut]
L_Val = L_train[:val_cut]

X_train = X_train[val_cut:]
X_train_smiles = X_train_smiles[val_cut:]
L_train = L_train[val_cut:]

# Save the SMILES strings first, then the preprocessed inputs (same order as before).
for stem, array in (
    ("train_smiles_", X_train_smiles),
    ("test_smiles_", X_test_smiles),
    ("validation_smiles_", X_Val_smiles),
    ("train_", X_train),
    ("test_", X_test),
    ("validation_", X_Val),
):
    # Writing through an explicit file handle keeps the exact file name.
    with open(stem + SPLIT_TAG + ".npy", 'wb') as f:
        np.save(f, array)

# Save the label tensors.
for stem, tensor in (
    ("L_train_", L_train),
    ("L_test_", L_test),
    ("L_Val_", L_Val),
):
    torch.save(tensor, stem + SPLIT_TAG + ".pt")

In [15]:
# Reload the persisted split from disk: label tensors via torch.load, input arrays via np.load.
# NOTE: allow_pickle=True lets NumPy unpickle object arrays; only load files you trust.
def _load_labels(stem):
    """Load the label tensor stored under `stem` for the current `chem_prop`."""
    return torch.load(stem + chem_prop + "_50_1.pt")


def _load_inputs(stem):
    """Load the input array stored under `stem` for the current `chem_prop`."""
    with open(stem + chem_prop + "_50_1.npy", 'rb') as handle:
        return np.load(handle, allow_pickle=True)


L_train = _load_labels("L_train_")
L_test = _load_labels("L_test_")
L_Val = _load_labels("L_Val_")

X_train = _load_inputs("train_")
X_test = _load_inputs("test_")
X_Val = _load_inputs("validation_")

In [None]:
# print("Training model...")
# runner.train_gen_pred(
#     X_train,
#     L_train,
#     X_test,
#     L_test,
#     X_Val,
#     L_Val,
#     load_epoch= 1000,
#     lr=conf["lr"],
#     anneal_rate=conf["anneal_rate"],
#     clip_norm=conf["clip_norm"],
#     num_epochs=conf["num_epochs"],
#     alpha=conf["alpha"],
#     max_alpha=conf["max_alpha"],
#     step_alpha=conf["step_alpha"],
#     beta=conf["beta"],
#     max_beta=conf["max_beta"],
#     step_beta=conf["step_beta"],
#     anneal_iter=conf["anneal_iter"],
#     alpha_anneal_iter=conf["alpha_anneal_iter"],
#     kl_anneal_iter=conf["kl_anneal_iter"],
#     print_iter=100,
#     save_iter= 1000,
#     batch_size=conf["batch_size"],
#     num_workers=conf["num_workers"],
#     label_pct=0.5,
#     chem_prop = "LogP"
# )

Training model...
Model #Params: 4732K
[Train][1100] Alpha: 0.000, Beta: 0.000, Loss: 40.79, KL: 679.81, MAE: 0.85188, Word Loss: 28.13, Topo Loss: 7.37, Assm Loss: 5.29, Pred Loss: 0.84, Word: 75.06, Topo: 94.72, Assm: 75.16, PNorm: 153.08, GNorm: 47.52
[Train][1200] Alpha: 0.000, Beta: 0.000, Loss: 39.47, KL: 717.46, MAE: 0.84108, Word Loss: 27.26, Topo Loss: 6.95, Assm Loss: 5.26, Pred Loss: 0.81, Word: 76.19, Topo: 95.11, Assm: 75.32, PNorm: 157.58, GNorm: 35.72
[Train][1300] Alpha: 0.000, Beta: 0.000, Loss: 37.01, KL: 750.64, MAE: 0.85575, Word Loss: 25.91, Topo Loss: 6.35, Assm Loss: 4.75, Pred Loss: 0.83, Word: 76.89, Topo: 95.61, Assm: 77.72, PNorm: 161.90, GNorm: 50.00
[Train][1400] Alpha: 0.000, Beta: 0.000, Loss: 36.21, KL: 796.35, MAE: 0.81510, Word Loss: 25.03, Topo Loss: 6.43, Assm Loss: 4.75, Pred Loss: 0.78, Word: 77.64, Topo: 95.35, Assm: 77.52, PNorm: 164.86, GNorm: 42.02
[Train][1500] Alpha: 0.000, Beta: 0.000, Loss: 34.63, KL: 834.32, MAE: 0.76559, Word Loss: 23.85,

In [None]:
print("Training model...")

# Settings forwarded unchanged from the JSON config.
config_keys = (
    "lr",
    "anneal_rate",
    "clip_norm",
    "num_epochs",
    "max_alpha",
    "step_alpha",
    "max_beta",
    "step_beta",
    "anneal_iter",
    "alpha_anneal_iter",
    "kl_anneal_iter",
    "batch_size",
    "num_workers",
)
train_kwargs = {key: conf[key] for key in config_keys}

# Settings fixed in this notebook instead of being read from the config.
# NOTE(review): alpha=410.0 and beta=0.1 replace conf["alpha"] / conf["beta"]; the reason
# for these particular values is not recorded here — confirm.
train_kwargs.update(
    load_epoch=load_epoch,
    alpha=410.0,
    beta=0.1,
    print_iter=100,
    save_iter=1000,
    label_pct=0.5,
    chem_prop=chem_prop,
)

# Positional order: train, test, then validation inputs/labels.
runner.train_gen_pred_supervised(
    X_train, L_train, X_test, L_test, X_Val, L_Val, **train_kwargs
)

Training model...
Model #Params: 5207K
[Train][143100] Alpha: 410.000, Beta: 0.100, Loss: 11.80, KL: 54.51, MAE: 0.06099, Word Loss: 2.86, Topo Loss: 0.57, Assm Loss: 0.58, Pred Loss: 0.01, Word: 93.88, Topo: 99.28, Assm: 95.55, PNorm: 934.60, GNorm: 50.00
[Train][143200] Alpha: 410.000, Beta: 0.100, Loss: 10.93, KL: 54.89, MAE: 0.05701, Word Loss: 2.68, Topo Loss: 0.56, Assm Loss: 0.56, Pred Loss: 0.00, Word: 94.08, Topo: 99.31, Assm: 95.51, PNorm: 935.23, GNorm: 50.00
[Train][143300] Alpha: 410.000, Beta: 0.100, Loss: 11.45, KL: 54.70, MAE: 0.06046, Word Loss: 2.85, Topo Loss: 0.55, Assm Loss: 0.62, Pred Loss: 0.00, Word: 93.83, Topo: 99.29, Assm: 94.69, PNorm: 935.90, GNorm: 50.00
[Train][143400] Alpha: 410.000, Beta: 0.100, Loss: 11.14, KL: 54.67, MAE: 0.05561, Word Loss: 2.83, Topo Loss: 0.58, Assm Loss: 0.61, Pred Loss: 0.00, Word: 93.68, Topo: 99.22, Assm: 95.13, PNorm: 936.44, GNorm: 50.00
[Train][143500] Alpha: 410.000, Beta: 0.100, Loss: 11.22, KL: 54.28, MAE: 0.05734, Word L