In [71]:
import torch
import random

from tqdm import tqdm

from hgraph.mol_graph import MolGraph
from hgraph.encoder import HierMPNEncoder
from hgraph.decoder import HierMPNDecoder
from hgraph.vocab import Vocab, PairVocab, common_atom_vocab
from hgraph.hgnn import HierVAE, HierVGNN, HierCondVGNN
from hgraph.dataset import MoleculeDataset, MolPairDataset, DataFolder, MolEnumRootDataset

from multiprocessing import Pool

In [72]:
torch.cuda.is_available()

True

In [73]:
import argparse

parser = argparse.ArgumentParser(description='Foo')

parser.add_argument('--train', default='train_processed/')
parser.add_argument('--vocab', default='data/chembl/vocab.txt')
parser.add_argument('--atom_vocab', default=common_atom_vocab)
parser.add_argument('--save_dir', default='ckpt/chembl-pretrained')
parser.add_argument('--model', default='ckpt/chembl-pretrained/model.ckpt')
parser.add_argument('--seed', type=int, default=7)

parser.add_argument('--rnn_type', type=str, default='LSTM')
parser.add_argument('--hidden_size', type=int, default=250)
parser.add_argument('--embed_size', type=int, default=250)
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--latent_size', type=int, default=32)
parser.add_argument('--depthT', type=int, default=15)
parser.add_argument('--depthG', type=int, default=15)
parser.add_argument('--diterT', type=int, default=1)
parser.add_argument('--diterG', type=int, default=3)
parser.add_argument('--dropout', type=float, default=0.0)

parser.add_argument('--epoch', type=int, default=20)
parser.add_argument('--anneal_rate', type=float, default=0.9)
parser.add_argument('--anneal_iter', type=int, default=25000)
parser.add_argument('--print_iter', type=int, default=50)
parser.add_argument('--save_iter', type=int, default=5000)

parser.add_argument('--nsample', type=int, default=100)

parser.add_argument('--ncpu', type=int, default=8)

args = parser.parse_args([])


torch.manual_seed(args.seed)
random.seed(args.seed)


In [74]:
vocab = [x.strip("\r\n ").split() for x in open(args.vocab)]
args.vocab = PairVocab(vocab)

In [76]:
model = HierVAE(args)

model_state, optimizer_state, total_step, beta = torch.load(args.model)
model.load_state_dict(model_state)
model.eval()
model.cuda()



HierVAE(
  (encoder): HierMPNEncoder(
    (E_c): Sequential(
      (0): Embedding(1578, 250)
      (1): Dropout(p=0.0, inplace=False)
    )
    (E_i): Sequential(
      (0): Embedding(5623, 250)
      (1): Dropout(p=0.0, inplace=False)
    )
    (W_c): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_i): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.0, inplace=False)
    )
    (W_root): Sequential(
      (0): Linear(in_features=500, out_features=250, bias=True)
      (1): Tanh()
    )
    (tree_encoder): MPNEncoder(
      (W_o): Sequential(
        (0): Linear(in_features=500, out_features=250, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.0, inplace=False)
      )
      (rnn): LSTM(
        (W_i): Sequential(
          (0): Linear(in_features=520, out_features=250, bias=True)
          (1): Sigmoid()
       

In [77]:
with torch.no_grad():
    for _ in tqdm(range(args.nsample // args.batch_size)):
        smiles_list = model.sample(args.batch_size, greedy=True)

100%|██████████| 2/2 [00:05<00:00,  2.77s/it]


In [6]:
model.encoder(smiles_list)

TypeError: forward() missing 1 required positional argument: 'graph_tensors'

In [8]:
args.train = '/home/adam/Projects/hgraph2graph/data/chembl/all.txt'


dataset = DataFolder(args.train, args.batch_size)

for batch in tqdm(dataset):
    total_step += 1

NotADirectoryError: [Errno 20] Not a directory: '/home/adam/Projects/hgraph2graph/data/chembl/all.txt'

In [44]:
with open(args.vocab) as f:
    vocab = [x.strip("\r\n ").split() for x in f]
args.vocab = PairVocab(vocab, cuda=False)

In [45]:
def tensorize(mol_batch, vocab):
    x = MolGraph.tensorize(mol_batch, vocab, common_atom_vocab)
    return to_numpy(x)

def to_numpy(tensors):
    convert = lambda x : x.numpy() if type(x) is torch.Tensor else x
    a,b,c = tensors
    b = [convert(x) for x in b[0]], [convert(x) for x in b[1]]
    return a, b, c

In [54]:
pool = Pool(args.ncpu)

batches = [smiles_list[i : i + args.batch_size] for i in range(0, len(smiles_list), args.batch_size)]
func = tensorize(mol_batch=smiles_list, vocab=args.vocab)
all_data = pool.map(func, batches[0])
num_splits = max(len(all_data) // 1000, 1)

le = (len(all_data) + num_splits - 1) // num_splits

for split_id in range(num_splits):
    st = split_id * le
    sub_data = all_data[st : st + le]


TypeError: 'tuple' object is not callable

In [57]:
func[0]

(<networkx.classes.digraph.DiGraph at 0x7fe64a3c9370>,
 <networkx.classes.digraph.DiGraph at 0x7fe64413da00>)

In [25]:
MolGraph(smiles_list[0])

<hgraph.mol_graph.MolGraph at 0x7fe649e4b880>

In [26]:
mol_batch = smiles_list
mol_batch = [MolGraph(x) for x in mol_batch]

In [27]:
tree_tensors, tree_batchG = MolGraph.tensorize_graph([x.mol_tree for x in mol_batch], vocab)

TypeError: list indices must be integers or slices, not tuple

In [31]:
mol_batch[0].mol_tree.G

AttributeError: 'DiGraph' object has no attribute 'G'

In [23]:

graph_tensors, graph_batchG = MolGraph.tensorize_graph([x.mol_graph for x in mol_batch], avocab)
tree_scope = tree_tensors[-1]
graph_scope = graph_tensors[-1]

max_cls_size = max( [len(c) for x in mol_batch for c in x.clusters] )
cgraph = torch.zeros(len(tree_batchG) + 1, max_cls_size).int()
for v,attr in tree_batchG.nodes(data=True):
    bid = attr['batch_id']
    offset = graph_scope[bid][0]
    tree_batchG.nodes[v]['inter_label'] = inter_label = [(x + offset, y) for x,y in attr['inter_label']]
    tree_batchG.nodes[v]['cluster'] = cls = [x + offset for x in attr['cluster']]
    tree_batchG.nodes[v]['assm_cands'] = [add(x, offset) for x in attr['assm_cands']]
    cgraph[v, :len(cls)] = torch.IntTensor(cls)

all_orders = []
for i,hmol in enumerate(mol_batch):
    offset = tree_scope[i][0]
    order = [(x + offset, y + offset, z) for x,y,z in hmol.order[:-1]] + [(hmol.order[-1][0] + offset, None, 0)]
    all_orders.append(order)

tree_tensors = tree_tensors[:4] + (cgraph, tree_scope)
return (tree_batchG, graph_batchG), (tree_tensors, graph_tensors), all_orders

<hgraph.vocab.PairVocab at 0x7fe7266de130>

In [32]:
MoleculeDataset(smiles_list, vocab, )

TypeError: __init__() missing 3 required positional arguments: 'vocab', 'avocab', and 'batch_size'

In [58]:
smiles_list

['Cc1c(Nc2ccnc(N3CC4CC3CN4)n2)sc2ccc(F)cc12',
 'CCOC(=O)C(=NCCc1ccccc1)SCc1ccccc1',
 'CC(=O)OCCCCCC(C)(C)CO',
 'O=C(CC(O)c1ccccc1F)NC1CCCC1',
 'O=C(CNC(=O)C(Cc1ccc(O)cc1)NCP(=O)(O)O)NO',
 'CC(=O)N(C(=O)Nc1cc(-c2ccccc2)on1)C1CCCCC1',
 'O=C(NCCCl)c1ccccc1',
 'O=[N+]([O-])c1ccc(N=Nc2cccc(Cl)c2)cc1',
 'CCN(CC)C(=O)CCNC(=O)c1ccco1',
 'NC(CC(=O)O)C(=O)O',
 'NC(=O)C(Cc1ccccc1)CC(NC(=O)Cn1c(-c2ccccc2)nc2ccccc21)C(=O)NCc1ccccc1',
 'O=C(Cc1cccs1)OC1CCCC(F)(F)C1',
 'CCOC(=O)CSc1cc(Cl)c(Cl)cc1Cl',
 'COc1ccc(C#Cc2ccccc2F)cc1',
 'S=C(NCc1ccco1)n1ccnc1',
 'CN(C)CN1CCN=C1S',
 'CCOc1ccc(NC(=O)CCn2nnnc2C)cc1',
 'C=CC[n+]1c(C)cccc1C',
 'CSc1ccc(NC(=S)N2CCc3ccccc3C2)cc1',
 'CCC(CC)N(C(=O)COc1ccc(Cl)cc1)C1CCN(C(=O)c2ccccc2Cl)CC1',
 'O=C(Cc1ccccc1Cl)N1CC(Cc2ccccc2)CC1=O',
 'Cc1c(CCCSc2ccccc2)cccc1-c1nc2ccccc2o1',
 'Cc1cnc(-c2cnc(N3CCOCC3)c(C#N)c2)n1-c1ccc(F)cc1',
 'O=C(CCC(F)(F)F)c1ccccc1',
 'CCOC(=O)c1ccc2oc(-c3ccccc3)cc(=O)c2c1OCCc1cccc(OCC(=O)O)c1',
 'Cc1nn(C)c(C)c1Cn1ncc(N2CCC(Oc3ccccc3)CC2)c(Cl)c1=O',


In [78]:
for epoch in range(args.epoch):
    dataset = DataFolder(args.train, args.batch_size)

    for batch in tqdm(dataset):
        loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=beta)

  0%|          | 0/1000 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 1.83 GiB total capacity; 901.54 MiB already allocated; 19.81 MiB free; 930.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF