In [12]:
import math
import os

In [13]:
import deepchem as dc
from deepchem.utils import ScaffoldGenerator
from deepchem.utils.save import log
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [14]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

In [15]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [16]:
import random
from collections import OrderedDict
from scipy.stats import pearsonr

In [17]:
random.seed(2)
np.random.seed(2)
torch.manual_seed(2)

<torch._C.Generator at 0x7f4cecf18a90>

In [38]:
def generate_scaffold(smiles, include_chirality=False):
    """Compute the Bemis-Murcko scaffold for a SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    engine = ScaffoldGenerator(include_chirality=include_chirality)
    scaffold = engine.get_scaffold(mol)
    return scaffold

In [39]:
def split(dataset,
          frac_train=.80,
          frac_valid=.10,
          frac_test=.10,
          log_every_n=1000):
    """
    Splits internal compounds into train/validation/test by scaffold.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    scaffolds = {}
    log("About to generate scaffolds", True)
    data_len = len(dataset)

    for ind, smiles in enumerate(dataset):
        if ind % log_every_n == 0:
            log("Generating scaffold %d/%d" % (ind, data_len), True)
        scaffold = generate_scaffold(smiles)
        if scaffold not in scaffolds:
            scaffolds[scaffold] = [ind]
        else:
            scaffolds[scaffold].append(ind)

    scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
    scaffold_sets = [
        scaffold_set
        for (scaffold, scaffold_set) in sorted(
            scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True)
    ]
    train_cutoff = frac_train * len(dataset)
    valid_cutoff = (frac_train + frac_valid) * len(dataset)
    train_inds, valid_inds, test_inds = [], [], []
    log("About to sort in scaffold sets", True)
    for scaffold_set in scaffold_sets:
        if len(train_inds) + len(scaffold_set) > train_cutoff:
            if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
                test_inds += scaffold_set    
            else:
                valid_inds += scaffold_set
        else:
            train_inds += scaffold_set
    return train_inds, valid_inds, test_inds

In [40]:
def donkey_load_dataset(filename, whiten=False):
    f = open(filename, 'r')
    features = []
    labels = []
    tracer = 0
    for line in f:
        if tracer == 0:
            tracer += 1
            continue
        splits =  line[:-1].split(',')
        features.append(splits[-1])
        labels.append(float(splits[-2]))
    features = np.array(features)
    labels = np.array(labels, dtype='float32').reshape(-1, 1)

    train_ind, val_ind, test_ins = split(features)

    train_features = np.take(features, train_ind)
    train_labels = np.take(labels, train_ind)
    val_features = np.take(features, val_ind)
    val_labels = np.take(labels, val_ind)

    return train_features, train_labels, val_features, val_labels

In [41]:
DATASET = 'az_ppb.csv'
print(DATASET)

az_ppb.csv


In [42]:
T = 3
BATCH_SIZE = 48
MAXITER = 40000
LIMIT = 0
LR = 5e-4

In [43]:
R = nn.Linear(150, 128)
U = {0: nn.Linear(156, 75), 1: nn.Linear(156, 75), 2: nn.Linear(156, 75)}
V = {0: nn.Linear(75, 75), 1: nn.Linear(75, 75), 2: nn.Linear(75, 75)}
E = nn.Linear(6, 6)

In [44]:
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by .8 every 5 epochs"""
    lr = LR * (0.9 ** (epoch // 10))
    print('new lr [%.5f]' % lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [45]:
def load_dataset():
    train_features, train_labels, val_features, val_labels = donkey_load_dataset(DATASET)

    scaler = preprocessing.StandardScaler().fit(train_labels)
    train_labels = scaler.transform(train_labels)
    val_labels = scaler.transform(val_labels)

    train_labels = Variable(torch.FloatTensor(train_labels), requires_grad=False)
    val_labels = Variable(torch.FloatTensor(val_labels), requires_grad=False)

    return train_features, train_labels, val_features, val_labels

In [46]:
def readout(h, h2):
    catted_reads = map(lambda x: torch.cat([h[x[0]], h2[x[1]]], 1), zip(h2.keys(), h.keys()))
    activated_reads = map(lambda x: F.selu( R(x) ), catted_reads)
    readout = Variable(torch.zeros(1, 128))
    for read in activated_reads:
        readout = readout + read
    return F.tanh(readout)

In [47]:
def message_pass(g, h, k):
    for v in g.keys():
        neighbors = g[v]
        for neighbor in neighbors:
            e_vw = neighbor[0] # feature variable
            w = neighbor[1]

            m_w = V[k](h[w])
            m_e_vw = E(e_vw)
            reshaped = torch.cat( (h[v], m_w, m_e_vw), 1)
            h[v] = F.selu(U[k](reshaped))

In [48]:
def construct_multigraph(smile):
    g = OrderedDict({})
    h = OrderedDict({})
    molecule = Chem.MolFromSmiles(smile)
    for i in xrange(0, molecule.GetNumAtoms()):
        atom_i = molecule.GetAtomWithIdx(i)
        h[i] = Variable(torch.FloatTensor(dc.feat.graph_features.atom_features(atom_i))).view(1, 75)
        for j in xrange(0, molecule.GetNumAtoms()):
            e_ij = molecule.GetBondBetweenAtoms(i, j)
            if e_ij != None:
                e_ij =  map(lambda x: 1 if x == True else 0, dc.feat.graph_features.bond_features(e_ij)) # ADDED edge feat
                e_ij = Variable(torch.FloatTensor(e_ij).view(1, 6))
                atom_j = molecule.GetAtomWithIdx(j)
                if i not in g:
                    g[i] = []
                g[i].append( (e_ij, j) )
    return g, h

In [51]:
train_smiles, train_labels, val_smiles, val_labels = load_dataset()

About to generate scaffolds
Generating scaffold 0/1614


RDKit ERROR: [20:27:51] SMILES Parse Error: syntax error while parsing: %
RDKit ERROR: [20:27:51] SMILES Parse Error: Failed parsing SMILES '%' for input: '%'


ValueError: No molecule provided

In [50]:
linear = nn.Linear(128, 1)
params = [{'params': R.parameters()},
         {'params': U[0].parameters()},
         {'params': U[1].parameters()},
         {'params': U[2].parameters()},
         {'params': E.parameters()},
         {'params': V[0].parameters()},
         {'params': V[1].parameters()},
         {'params': V[2].parameters()},
         {'params': linear.parameters()}]

In [None]:
num_epoch = 0
optimizer = optim.Adam(params, lr=LR, weight_decay=1e-4)
for i in xrange(0, MAXITER):
    optimizer.zero_grad()
    train_loss = Variable(torch.zeros(1, 1))
    y_hats_train = []
    for j in xrange(0, BATCH_SIZE):
        sample_index = random.randint(0, len(train_smiles) - 2)
        smile = train_smiles[sample_index]
        g, h = construct_multigraph(smile) # TODO: cache this

        g2, h2 = construct_multigraph(smile)

        for k in xrange(0, T):
            message_pass(g, h, k)

        x = readout(h, h2)
        #x = F.selu( fc(x) )
        y_hat = linear(x)
        y = train_labels[sample_index]

        y_hats_train.append(y_hat)

        error = (y_hat - y)*(y_hat - y) / Variable(torch.FloatTensor([BATCH_SIZE])).view(1, 1)
        train_loss = train_loss + error

    train_loss.backward()
    optimizer.step()

    if i % int(len(train_smiles) / BATCH_SIZE) == 0:
        val_loss = Variable(torch.zeros(1, 1), requires_grad=False)
        y_hats_val = []
        for j in xrange(0, len(val_smiles)):
            g, h = construct_multigraph(val_smiles[j])
            g2, h2 = construct_multigraph(val_smiles[j])

            for k in xrange(0, T):
                message_pass(g, h, k)

            x = readout(h, h2)
            #x = F.selu( fc(x) )
            y_hat = linear(x)
            y = val_labels[j]

            y_hats_val.append(y_hat)

            error = (y_hat - y)*(y_hat - y) / Variable(torch.FloatTensor([len(val_smiles)])).view(1, 1)
            val_loss = val_loss + error

    y_hats_val = np.array(map(lambda x: x.data.numpy(), y_hats_val))
    y_val = np.array(map(lambda x: x.data.numpy(), val_labels))
    y_hats_val = y_hats_val.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)

    r2_val_old = r2_score(y_val, y_hats_val)
    r2_val_new = pearsonr(y_val, y_hats_val)[0]**2

    train_loss_ = train_loss.data.numpy()[0]
    val_loss_ = val_loss.data.numpy()[0]
    print 'epoch [%i/%i] train_loss [%f] val_loss [%f] r2_val_old [%.4f], r2_val_new [%.4f]' \
                  % (num_epoch, 100, train_loss_, val_loss_, r2_val_old, r2_val_new)
    num_epoch += 1