In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import json
import wandb
import dfs_code
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import sys
sys.path = ['../../src'] + sys.path
from dfs_transformer import DFSCodeSeq2SeqFC, TrainerNew, PubChem, get_n_files, TransformerPlusHeads
from dfs_transformer.training.utils import seq_loss, seq_acc, collate_BERT, collate_rnd2min
import argparse
import yaml
import functools
from ml_collections import ConfigDict
from copy import deepcopy
import pickle
from sklearn.metrics import r2_score
import resource
# Make sure the process may hold at least 2*2048 open file descriptors.
# NOTE(review): presumably needed for DataLoader workers that share tensors via
# file descriptors (see the commented-out sharing-strategy line) — confirm.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
nofile_soft, nofile_hard = rlimit
nofile_wanted = 2*2048
# Only ever raise the soft limit: if it is already unlimited or high enough,
# leave it alone instead of lowering it.
if nofile_soft != resource.RLIM_INFINITY and nofile_soft < nofile_wanted:
    # Never request more than the hard limit; setrlimit raises ValueError for an
    # unprivileged process whose soft limit would exceed it.
    if nofile_hard != resource.RLIM_INFINITY:
        nofile_wanted = min(nofile_wanted, nofile_hard)
    resource.setrlimit(resource.RLIMIT_NOFILE, (nofile_wanted, nofile_hard))
#torch.multiprocessing.set_sharing_strategy('file_system')



# Paths of the three YAML files (model / data / training) that make up the run
# configuration, relative to this notebook.
args = ConfigDict({
    'yaml_model':"../../config/selfattn/model/bert.yaml",
    'yaml_data':"../../config/selfattn/data/pubchem1M.yaml",
    'yaml_training':"../../config/selfattn/training/min2min_new.yaml"
})

# Start from empty sections and fill each one from its YAML file.
config = ConfigDict({'model':{}, 'data':{}, 'training':{}})
yaml_sources = (
    ('model', args.yaml_model),
    ('data', args.yaml_data),
    ('training', args.yaml_training),
)
for section, yaml_path in yaml_sources:
    with open(yaml_path) as file:
        # NOTE(review): FullLoader accepts more than plain YAML; if these files
        # only hold mappings and scalars, yaml.safe_load would suffice — confirm.
        config[section] = ConfigDict(yaml.load(file, Loader=yaml.FullLoader))

# Independent copies of each config section; the overrides below only touch the
# copies, except for fraction_missing which is written to `config.training`.

# Model: force feature usage on and loop usage off for this notebook.
m = deepcopy(config.model)
m['no_features'] = False
m['use_loops'] = False

# Data: request no molecular properties.
d = deepcopy(config.data)
d.molecular_properties = []

# Training: `t` is copied *before* fraction_missing is set, so the value lives
# on `config.training` only (which is where the collate_fn setup reads it from).
t = deepcopy(config.training)
config.training.fraction_missing=0.5

# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:%d'%config.training.gpu_id)
else:
    device = torch.device('cpu')


# Pick the batch collation function for the configured training mode.
# "min2min" and "rnd2rnd" both use collate_BERT and differ only in the `mode`
# argument; "rnd2min" uses its own collate function without masking arguments.
if t.mode in ("min2min", "rnd2rnd"):
    collate_fn = functools.partial(collate_BERT,
                                   mode=t.mode,
                                   fraction_missing=config.training.fraction_missing,
                                   use_loops=m.use_loops)
elif t.mode == "rnd2min":
    collate_fn = functools.partial(collate_rnd2min,
                                   use_loops=m.use_loops)
else:
    # Fail loudly: otherwise collate_fn would be undefined (or, in a live kernel,
    # silently keep the value from a previous run with a different mode).
    raise ValueError("unsupported training mode %r; expected 'min2min', 'rnd2rnd' or 'rnd2min'" % (t.mode,))

# Optional validation set. `exclude` collects the validation SMILES so that the
# training dataset built in a later cell can leave them out.
validloader = None
# Bind `exclude` even when no validation set is configured; the training
# dataset cell passes it unconditionally and would otherwise hit a NameError.
# NOTE(review): assumes PubChem accepts an empty list for `exclude` — confirm
# against its signature.
exclude = []
if d.valid_path is not None:
    validset = PubChem(d.valid_path, max_nodes=m.max_nodes, max_edges=m.max_edges, noFeatures=m.no_features,
                       molecular_properties=None)
    validloader = DataLoader(validset, batch_size=t.batch_size, shuffle=True, 
                             pin_memory=True, collate_fn=collate_fn)
    exclude = validset.smiles

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9978/9978 [00:01<00:00, 7976.99it/s]


In [3]:
from dfs_transformer import to_cuda as utils_to_cuda
import tqdm
# Pre-bind `device` as the keyword argument of the library's to_cuda helper, so
# callers of this notebook-level `to_cuda` do not have to pass it themselves.
to_cuda = functools.partial(utils_to_cuda, device=device)

In [None]:
# Training dataset, with the validation SMILES passed as `exclude`.
dataset = PubChem(d.path, n_used = d.n_used, max_nodes=m.max_nodes, 
                  max_edges=m.max_edges, exclude=exclude, noFeatures=m.no_features,
                  molecular_properties=d.molecular_properties)

# Workers are disabled here (the configured value would be t.num_workers).
loader_num_workers = 0
loader_kwargs = dict(batch_size=t.batch_size, shuffle=True,
                     pin_memory=t.pin_memory, collate_fn=collate_fn,
                     num_workers=loader_num_workers)
# prefetch_factor is only meaningful with worker processes; DataLoader raises a
# ValueError if it is specified while num_workers == 0, so pass it conditionally.
if loader_num_workers > 0:
    loader_kwargs['prefetch_factor'] = t.prefetch_factor
loader = DataLoader(dataset, **loader_kwargs)

 29%|██████████████████████████████████████████████████▎                                                                                                                             | 2/7 [00:35<01:28, 17.67s/it]

In [None]:
# Pull a single batch from the training loader; the next few cells inspect it.
data = next(iter(loader))

In [None]:
# Index used by the following cells to slice `[:, batch_id]` out of each entry.
batch_id = 1

In [None]:
# Show the 'dfs_from' entry of the batch's first element at index `batch_id` of the second axis.
data[0]['dfs_from'][:, batch_id]

In [None]:
# Show the 'dfs_to' entry of the batch's first element at index `batch_id` of the second axis.
data[0]['dfs_to'][:, batch_id]

In [None]:
# Show the 'atm_from' entry of the batch's first element at index `batch_id` of the second axis.
data[0]['atm_from'][:, batch_id]

In [None]:
# Show the 'bnd' entry of the batch's first element at index `batch_id` of the second axis.
data[0]['bnd'][:, batch_id]

In [None]:
# Show the first 118 columns of the 'atm_to' entry at index `batch_id`.
# NOTE(review): 118 presumably matches the number of chemical elements in the
# atom encoding — confirm.
data[0]['atm_to'][:, batch_id][:, :118]

In [None]:
# Shape of the first item in the batch's second element (the size-collection
# loop further down reads shape[0] of these items as a vertex count).
data[1][0].shape

In [None]:
# Two separate, empty accumulators for the per-item vertex and edge counts.
n_vertices, n_edges = [], []

In [None]:
# Collect, over one full pass of the loader, shape[0] of every item in the
# second (vertex features) and third (edge features) element of each batch.
#
# The accumulators are reset here so the cell is idempotent: re-running it
# neither double counts nor does `+=` on `n_edges` after a later cell has
# turned it into a numpy array (which would add element-wise, not append).
n_vertices = []
n_edges = []
# Loop variable is `batch`, not `data`, so the batch inspected by the cells
# above is not overwritten.
for batch in tqdm.tqdm(loader):
    n_vertices.extend(vfeats.shape[0] for vfeats in batch[1])
    n_edges.extend(efeats.shape[0] for efeats in batch[2])

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Histogram of the collected vertex counts; bin count chosen by the 'rice' rule.
plt.hist(n_vertices, bins='rice')

In [None]:
# Histogram of the collected edge counts; bin count chosen by the 'rice' rule.
plt.hist(n_edges, bins='rice')

In [None]:
# Report how many items were counted in total and how many of them have at
# most 150 edges (total minus those exceeding 150).
n_edges = np.asarray(n_edges)
n_total = len(n_edges)
n_over_150 = (n_edges > 150).sum()
print(n_total, n_total - n_over_150)

# MoleculeNet downstream tasks

In [None]:
import deepchem as dc
from dfs_transformer import smiles2graph

# NOTE(review): this rebinds `dataset`, which earlier cells used for the PubChem
# dataset object, to a plain string.
dataset = 'clintox'



In [None]:
# For each MoleculeNet dataset: load it unsplit with the raw featurizer, convert
# every entry of X to a graph, and plot a histogram of the per-molecule edge counts.
molnet_loaders = {
    'clintox': dc.molnet.load_clintox,
    'tox21': dc.molnet.load_tox21,
    'bbbp': dc.molnet.load_bbbp,
    'hiv': dc.molnet.load_hiv,
}
for dataset, load_fn in molnet_loaders.items():
    # splitter=None: only datasets[0] is read below.
    # NOTE(review): RawFeaturizer(True) presumably makes X hold SMILES strings — confirm.
    tasks, datasets, transformers = load_fn(reload=False, featurizer=dc.feat.RawFeaturizer(True), splitter=None)

    m_vertices = []
    m_edges = []
    # Iterate X directly (not via enumerate) so tqdm can show a total.
    for smiles in tqdm.tqdm(datasets[0].X, desc=dataset):
        # Named `graph`, not `d`: `d` is the data config used by earlier cells
        # and must not be overwritten, or re-running those cells breaks.
        graph = smiles2graph(smiles, False, False, np.inf, np.inf)
        m_vertices += [graph.atom_features.shape[0]]
        # NOTE(review): the //2 presumably accounts for edge_index listing each
        # undirected edge in both directions — confirm.
        m_edges += [graph.edge_index.shape[1]//2]

    plt.hist(m_edges, bins='rice')
    plt.title(dataset)
    plt.show()