In [1]:
! python --version

Python 3.6.5 :: Anaconda, Inc.


In [2]:
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_VISIBLE_DEVICES=2


In [3]:
%load_ext autoreload
%autoreload 2
use_cuda = torch.cuda.is_available()

In [4]:
from __future__ import print_function
from __future__ import division
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import torch 
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


torch.manual_seed(1)

<torch._C.Generator at 0x1824bd642d0>

In [6]:
#from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.Fingerprints import FingerprintMols

In [7]:
import time
import sys

In [9]:
#import data_preprocessing as dp
import csv

In [10]:
filename = './data/jak2_data.csv' 

In [11]:
from data import read_smiles_property_file
from data import cross_validation_split
from data import PredictorData

In [12]:
my_data = PredictorData(filename)

In [13]:
tokens = ['<', '>', '#', '%', ')', '(', '+', '-', '/', '.', '1', '0', '3', '2', '5', '4', '7',
          '6', '9', '8', '=', 'A', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'P', 'S', '[', ']',
          '\\', 'c', 'e', 'i', 'l', 'o', 'n', 'p', 's', 'r', ' ']
char2idx = {}
my_data.load_dictionary(tokens, char2idx)

In [14]:
lens = []
for sm in my_data.smiles:
    lens.append(len(sm))
max_len = max(lens)

In [15]:
for i in range(len(my_data.smiles)):
    l = len(my_data.smiles[i])
    my_data.smiles[i] = my_data.smiles[i] + ' '*(max_len - l)

In [16]:
cross_val_data, cross_val_labels = cross_validation_split(my_data.smiles, my_data.property)

In [17]:
def batch_char_tensor(smiles, use_cuda):
    tensor = torch.zeros(len(smiles), len(smiles[0])).long()
    for i in range (len(smiles)):
        string = smiles[i]
        for c in range(len(string)):
            tensor[i, c] = self.all_characters.index(string[c])
    if use_cuda:
        return Variable(tensor.cuda())
    else:
        return Variable(tensor)

In [18]:
def iterate_minibatches(X, y, batchsize=100):
    n = X.shape[0]
    ind = np.random.permutation(n)
    for start_index in range(0, n, batchsize):
        X_batch = batch_char_tensor(X[ind[start_index:start_index + batchsize]])
        y_batch = y[ind[start_index:start_index + batchsize], :]
        if use_cuda:
            yield (X_batch, torch.from_numpy(y_batch).float().cuda())
        else:
            yield (X_batch, torch.from_numpy(y_batch).float())

## SMILES based QSAR with Recurrent Neural Network

In [19]:
from RecurrentQSAR import RecurrentQSAR

In [20]:
models = []
train_logs = []
val_logs = []
num_epochs = 100
batch_size = 100

for i in range(5):
    
    train_loss_log = []
    val_loss_log = []
    
    models.append(RecurrentQSAR(input_dim=my_data.n_characters, data=my_data))
    models[i].cuda()
    criterion = nn.MSELoss()
    optimizer = optim.Adadelta(models[i].parameters(), lr=0.1,  weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
    
    trX = np.concatenate(cross_val_data[:i] + cross_val_data[i+1:])
    trY = np.concatenate(cross_val_labels[:i] + cross_val_labels[i+1:])
    teX = np.array(cross_val_data[i])
    teY = np.array(cross_val_labels[i])

    for epoch in range(num_epochs):
        scheduler.step()
        models[i].fit(criterion, optimizer, trX, trY.reshape(-1), train_loss_log, num_epochs=1, batch_size=batch_size)
        models[i].validate(teX, teY, batch_size = batch_size, val_loss_log=val_loss_log)
        
    train_logs.append(train_loss_log)
    val_logs.append(val_loss_log)
    plt.plot(train_loss_log)
    plt.plot(val_loss_log)

RuntimeError: Cannot initialize CUDA without ATen_cuda library. PyTorch splits its backend into two shared libraries: a CPU library and a CUDA library; this error has occurred because you are trying to use some CUDA functionality, but the CUDA library has not been loaded by the dynamic linker for some reason.  The CUDA library MUST be loaded, EVEN IF you don't directly use any symbols from the CUDA library! One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many dynamic linkers will delete dynamic library dependencies if you don't depend on any of their symbols.  You can check if this has occurred by using ldd on your binary to see if there is a dependency on *_cuda.so library.

In [None]:
optimizer.state_dict()