In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import argparse
import numpy as np
import tensorflow as tf
# TensorFlow session options: limit this process to a 0.5 fraction of GPU memory
# and enable on-demand growth of the allocation.
# NOTE(review): `config` is not handed to a tf.Session / K.set_session in any of
# the cells visible here; unless that happens elsewhere, these options are never
# applied -- confirm.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
import yaml
import time
import os
from keras import backend as K
from keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop
import hyperparameters
import mol_utils as mu
import mol_callbacks as mol_cb
from keras.callbacks import CSVLogger
from models import encoder_model, load_encoder
from models import decoder_model, load_decoder
from models import varLayer, load_varLayer
from models import property_predictor_model, load_property_predictor
from models import variational_layers, varLayer
from functools import partial
from keras.layers import Lambda


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:

def vectorize_data(params):
    """One-hot encode the SMILES data set and split it into train and test parts.

    The character set is read from ``params['char_file']`` and the molecules
    (plus optional property columns) from ``params['data_file']`` via
    ``mu.load_smiles_and_data_df``. The data is truncated to a whole number of
    batches, shuffled with ``params['RAND_SEED']`` and split according to
    ``params['val_split']``; the training part is again rounded down to a whole
    number of batches.

    Side effects:
        * sets ``params['NCHARS']`` to the size of the character set;
        * re-seeds NumPy's global random generator;
        * saves the test indices to ``params['test_idx_file']`` when that key
          is present.

    Args:
        params: hyper-parameter dict. Keys read here: ``MAX_LEN``,
            ``char_file``, ``data_file``, ``do_prop_pred``, ``PADDING``,
            ``batch_size``, ``RAND_SEED``, ``val_split`` and, optionally,
            ``reg_prop_tasks``, ``logit_prop_tasks``,
            ``data_normalization_out``, ``limit_data``, ``test_idx_file``.

    Returns:
        ``(X_train, X_test)`` when ``params['do_prop_pred']`` is falsy,
        otherwise ``(X_train, X_test, Y_train, Y_test)`` where ``Y_train`` and
        ``Y_test`` are lists of arrays:
            * regression tasks only:  ``Y_train[0]`` is the regression target;
            * logit tasks only:       ``Y_train[0]`` is the logit target;
            * both:                   ``Y_train[0]`` is the regression target
                                      and ``Y_train[1]`` the logit target.

    Raises:
        ValueError: if property prediction is requested but neither
            ``reg_prop_tasks`` nor ``logit_prop_tasks`` is given.
    """
    MAX_LEN = params['MAX_LEN']

    # Use a context manager so the character file is closed deterministically.
    with open(params['char_file']) as char_fh:
        CHARS = yaml.safe_load(char_fh)
    params['NCHARS'] = len(CHARS)
    NCHARS = len(CHARS)
    CHAR_INDICES = dict((c, i) for i, c in enumerate(CHARS))

    ## Load data for properties
    if params['do_prop_pred'] and ('data_file' in params):
        if "data_normalization_out" in params:
            normalize_out = params["data_normalization_out"]
        else:
            normalize_out = None

        if ("reg_prop_tasks" in params) and ("logit_prop_tasks" in params):
            smiles, Y_reg, Y_logit = mu.load_smiles_and_data_df(params['data_file'], MAX_LEN,
                    reg_tasks=params['reg_prop_tasks'], logit_tasks=params['logit_prop_tasks'],
                    normalize_out = normalize_out)
        elif "logit_prop_tasks" in params:
            smiles, Y_logit = mu.load_smiles_and_data_df(params['data_file'], MAX_LEN,
                    logit_tasks=params['logit_prop_tasks'], normalize_out=normalize_out)
        elif "reg_prop_tasks" in params:
            smiles, Y_reg = mu.load_smiles_and_data_df(params['data_file'], MAX_LEN,
                    reg_tasks=params['reg_prop_tasks'], normalize_out=normalize_out)
        else:
            raise ValueError("please specify logit and/or reg tasks")

    ## Load data if no properties
    else:
        smiles = mu.load_smiles_and_data_df(params['data_file'], MAX_LEN)

    if 'limit_data' in params.keys():
        # Seed before drawing the subsample: otherwise a different subset is
        # picked on every run and the saved test indices point at different
        # molecules each time.
        np.random.seed(params['RAND_SEED'])
        sample_idx = np.random.choice(np.arange(len(smiles)), params['limit_data'], replace=False)
        smiles=list(np.array(smiles)[sample_idx])
        if params['do_prop_pred'] and ('data_file' in params):
            if "reg_prop_tasks" in params:
                Y_reg =  Y_reg[sample_idx]
            if "logit_prop_tasks" in params:
                Y_logit =  Y_logit[sample_idx]

    print('Training set size is', len(smiles))
    print('first smiles: \"', smiles[0], '\"')
    print('total chars:', NCHARS)

    print('Vectorization...')
    X = mu.smiles_to_hot(smiles, MAX_LEN, params[
                             'PADDING'], CHAR_INDICES, NCHARS)

    print('Total Data size', X.shape[0])
    # Drop the tail so the data set holds a whole number of batches.
    if np.shape(X)[0] % params['batch_size'] != 0:
        X = X[:np.shape(X)[0] // params['batch_size']
              * params['batch_size']]
        if params['do_prop_pred']:
            if "reg_prop_tasks" in params:
                Y_reg = Y_reg[:np.shape(Y_reg)[0] // params['batch_size']
                      * params['batch_size']]
            if "logit_prop_tasks" in params:
                Y_logit = Y_logit[:np.shape(Y_logit)[0] // params['batch_size']
                      * params['batch_size']]

    # (Re-)seed right before shuffling so the split depends only on the seed
    # and the data size, exactly as before.
    np.random.seed(params['RAND_SEED'])
    rand_idx = np.arange(np.shape(X)[0])
    np.random.shuffle(rand_idx)

    TRAIN_FRAC = 1 - params['val_split']
    num_train = int(X.shape[0] * TRAIN_FRAC)

    # The training part must also be a whole number of batches.
    if num_train % params['batch_size'] != 0:
        num_train = num_train // params['batch_size'] * \
            params['batch_size']

    train_idx, test_idx = rand_idx[: int(num_train)], rand_idx[int(num_train):]

    if 'test_idx_file' in params.keys():
        np.save(params['test_idx_file'], test_idx)

    X_train, X_test = X[train_idx], X[test_idx]
    # The placeholder was previously printed literally because .format() was missing.
    print('shape of input vector : {}'.format(np.shape(X_train)))
    print('Training set size is {}, after filtering to max length of {}'.format(
        np.shape(X_train), MAX_LEN))

    if params['do_prop_pred']:
        Y_train = []
        Y_test = []
        if "reg_prop_tasks" in params:
            Y_reg_train, Y_reg_test = Y_reg[train_idx], Y_reg[test_idx]
            Y_train.append(Y_reg_train)
            Y_test.append(Y_reg_test)
        if "logit_prop_tasks" in params:
            Y_logit_train, Y_logit_test = Y_logit[train_idx], Y_logit[test_idx]
            Y_train.append(Y_logit_train)
            Y_test.append(Y_logit_test)

        return X_train, X_test, Y_train, Y_test

    else:
        return X_train, X_test


def load_models(params):
    """Build (or reload) the VAE sub-models and wire them into Keras models.

    The encoder, decoder and variational layer are either reloaded
    (``params['reload_model'] == True``) or freshly constructed. The latent
    sample is drawn inside this function with the reparameterisation
    ``z_mean + exp(z_log_var / 2) * kl_loss_var * epsilon`` during training and
    is ``z_mean`` otherwise.

    NOTE(review): this assumes the first output of ``varlayer`` is the
    log-variance tensor; that is defined in ``models`` and cannot be checked
    from here.

    Args:
        params: hyper-parameter dict. Keys read here: ``kl_loss_weight``,
            ``reload_model``, ``batch_size``, ``hidden_dim``, ``batchnorm_vae``,
            ``do_tgru``, ``do_prop_pred`` and, optionally, ``reg_prop_tasks``
            and ``logit_prop_tasks``.

    Returns:
        Without property prediction:
            ``(AE_only_model, encoder, decoder, varlayer, kl_loss_var)``
        With property prediction:
            ``(AE_only_model, AE_PP_model, encoder, decoder, varlayer,
            property_predictor, kl_loss_var)``

    Raises:
        ValueError: if property prediction is requested but no non-empty
            regression or logit task list is given.
    """
    # Imported locally because the notebook's import cell does not provide this
    # name; without it the batchnorm_vae branch below raised NameError.
    from keras.layers import BatchNormalization

    def identity(x):
        return K.identity(x)

    # Backend variable so the KL weight can be annealed during training.
    kl_loss_var = K.variable(params['kl_loss_weight'])

    if params['reload_model'] == True:
        encoder = load_encoder(params)
        decoder = load_decoder(params)
        varlayer = load_varLayer(params)
    else:
        encoder = encoder_model(params)
        decoder = decoder_model(params)
        varlayer = varLayer(params)

    x_in = encoder.inputs[0]

    z_mean, enc_output = encoder(x_in)
    z_log_var, z_mean_log_var_output = varlayer([z_mean, enc_output])

    def sampling(args):
        z_mean, z_log_var = args

        # K.random_normal draws fresh noise every time the graph is evaluated.
        # K.random_normal_variable (used previously) creates a variable whose
        # value is sampled once at initialisation, i.e. the same noise for
        # every batch.
        epsilon = K.random_normal(shape=(params['batch_size'], params['hidden_dim']),
                                  mean=0., stddev=1.)

        # The noise term is scaled by the (annealed) KL weight.
        z_rand = z_mean + K.exp(z_log_var / 2) * kl_loss_var * epsilon
        return K.in_train_phase(z_rand, z_mean)

    z_samp = Lambda(sampling)([z_mean, z_log_var])

    if params['batchnorm_vae']:
        z_samp = BatchNormalization(axis=-1)(z_samp)

    # Named identity layers give the outputs stable names that the loss and
    # target dictionaries can refer to.
    z_mean_log_var_output = Lambda(identity, name='z_mean_log_var')(z_mean_log_var_output)
    # Decoder
    if params['do_tgru']:
        x_out = decoder([z_samp, x_in])
    else:
        x_out = decoder(z_samp)

    x_out = Lambda(identity, name='x_pred')(x_out)
    model_outputs = [x_out, z_mean_log_var_output]

    # Pass a copy: model_outputs is extended below for the property-prediction
    # model and must not alias the list handed to this model.
    AE_only_model = Model(x_in, list(model_outputs))

    if params['do_prop_pred']:
        if params['reload_model'] == True:
            property_predictor = load_property_predictor(params)
        else:
            property_predictor = property_predictor_model(params)

        if (('reg_prop_tasks' in params) and (len(params['reg_prop_tasks']) > 0 ) and
                ('logit_prop_tasks' in params) and (len(params['logit_prop_tasks']) > 0 )):

            reg_prop_pred, logit_prop_pred   = property_predictor(z_mean)
            reg_prop_pred = Lambda(identity, name='reg_prop_pred')(reg_prop_pred)
            logit_prop_pred = Lambda(identity, name='logit_prop_pred')(logit_prop_pred)
            model_outputs.extend([reg_prop_pred,  logit_prop_pred])

        # regression only scenario
        elif ('reg_prop_tasks' in params) and (len(params['reg_prop_tasks']) > 0 ):
            reg_prop_pred = property_predictor(z_mean)
            reg_prop_pred = Lambda(identity, name='reg_prop_pred')(reg_prop_pred)
            model_outputs.append(reg_prop_pred)

        # logit only scenario
        elif ('logit_prop_tasks' in params) and (len(params['logit_prop_tasks']) > 0 ):
            logit_prop_pred = property_predictor(z_mean)
            logit_prop_pred = Lambda(identity, name='logit_prop_pred')(logit_prop_pred)
            model_outputs.append(logit_prop_pred)

        else:
            raise ValueError('no logit tasks or regression tasks specified for property prediction')

        # making the models:
        AE_PP_model = Model(x_in, model_outputs)
        return AE_only_model, AE_PP_model, encoder, decoder, varlayer, property_predictor, kl_loss_var

    else:
        return AE_only_model, encoder, decoder, varlayer, kl_loss_var


def kl_loss(truth_dummy, x_mean_log_var_output):
    """KL regularisation term computed from the concatenated mean / log-variance output.

    ``x_mean_log_var_output`` is split into two equal halves along axis 1, the
    first taken as the mean and the second as the log-variance. The result is
    ``-0.5 * mean(1 + log_var - mean**2 - exp(log_var))`` over the last axis.
    ``truth_dummy`` is not used; it is only there to satisfy the
    ``(y_true, y_pred)`` signature of a Keras loss.
    """
    mean_half, log_var_half = tf.split(x_mean_log_var_output, 2, axis=1)
    print('x_mean shape in kl_loss: ', mean_half.get_shape())
    # Per-dimension term of the closed-form KL expression.
    per_dim = 1 + log_var_half - K.square(mean_half) - K.exp(log_var_half)
    return - 0.5 * K.mean(per_dim, axis=-1)


In [5]:


# Read the experiment's hyper-parameters from the zinc experiment JSON file;
# every later cell takes its settings from this `params` dict.
params = hyperparameters.load_params('../models/zinc/exp.json')
print("All params:", params)

Using hyper-parameters:
name                      - zinc        
MAX_LEN                   - 120         
data_file                 - 250k_rndm_zinc_drugs_clean_3.csv
char_file                 - zinc.json   
encoder_weights_file      - zinc_encoder.h5
decoder_weights_file      - zinc_decoder.h5
varlayer_weights_file     - zinc_varlayer.h5
test_idx_file             - test_idx.npy
history_file              - history.csv 
checkpoint_path           - ./          
do_prop_pred              - False       
TRAIN_MODEL               - True        
ENC_DEC_TEST              - False       
PADDING                   - right       
RAND_SEED                 - 42          
epochs                    - 10          
vae_annealer_start        - 29          
dropout_rate_mid          - 0.08283292970479479
anneal_sigmod_slope       - 0.5106654305791392
recurrent_dim             - 488         
batch_size                - 126         
lr                        - 0.00039192162392520126
hidden_dim           

In [12]:
# Override the bare file names loaded from the experiment file with paths
# relative to this notebook, and request newly constructed models
# (encoder_model / decoder_model / varLayer) instead of the load_* variants.
params['char_file'] = '../models/zinc/zinc.json'
params['data_file'] = '../models/zinc/250k_rndm_zinc_drugs_clean_3.csv'
params['reload_model'] = False

In [13]:
# Wall-clock reference taken before data preparation starts.
start_time = time.time()

# Both calls are unpacked in their "no property prediction" form: with
# params['do_prop_pred'] truthy, vectorize_data returns four values and
# load_models seven, and these assignments would fail.
X_train, X_test = vectorize_data(params)
AE_only_model, encoder, decoder, varlayer, kl_loss_var = load_models(params)

# compile models
# Pick the optimizer named by params['optim']; params['momentum'] is mapped onto
# each optimizer's momentum-like argument (beta_1 / rho / momentum).
if params['optim'] == 'adam':
    optim = Adam(lr=params['lr'], beta_1=params['momentum'])
elif params['optim'] == 'rmsprop':
    optim = RMSprop(lr=params['lr'], rho=params['momentum'])
elif params['optim'] == 'sgd':
    optim = SGD(lr=params['lr'], momentum=params['momentum'])
else:
    # NotImplemented is a comparison sentinel, not an exception class: calling
    # it raised "TypeError: 'NotImplementedType' object is not callable" and
    # hid the intended message.
    raise NotImplementedError("Please define valid optimizer")

# Per-output losses, keyed by the names of the Lambda output layers created in
# load_models ('x_pred' and 'z_mean_log_var').
model_losses = {'x_pred': params['loss'],
                'z_mean_log_var': kl_loss}

# vae metrics, callbacks
# Sigmoid schedule with slope and start taken from params; it is handed to the
# annealing callback together with the kl_loss_var backend variable.
vae_sig_schedule = partial(mol_cb.sigmoid_schedule, slope=params['anneal_sigmod_slope'],
                           start=params['vae_annealer_start'])
vae_anneal_callback = mol_cb.WeightAnnealer_epoch(
        vae_sig_schedule, kl_loss_var, params['kl_loss_weight'], 'vae' )

# append=False: the history file is not appended to (see keras CSVLogger).
csv_clb = CSVLogger(params["history_file"], append=False)
callbacks = [ vae_anneal_callback, csv_clb]


def vae_anneal_metric(y_true, y_pred):
    """Pseudo-metric that reports the KL loss weight variable.

    Both arguments are ignored; they exist only so the function matches the
    ``(y_true, y_pred)`` signature of a Keras metric.
    """
    current_kl_weight = kl_loss_var
    return current_kl_weight

# Weight of the reconstruction loss, held in a backend variable.
xent_loss_weight = K.variable(params['xent_loss_weight'])
# Targets keyed by output-layer name. The 'z_mean_log_var' target is an array of
# ones with 2 * hidden_dim columns; kl_loss never reads its first argument, so
# the values serve only as a placeholder.
model_train_targets = {'x_pred':X_train,
            'z_mean_log_var':np.ones((np.shape(X_train)[0], params['hidden_dim'] * 2))}
model_test_targets = {'x_pred':X_test,
    'z_mean_log_var':np.ones((np.shape(X_test)[0], params['hidden_dim'] * 2))}

# loss_weights is a positional list: it follows the model's output order
# [x_pred, z_mean_log_var] established in load_models.
AE_only_model.compile(loss=model_losses,
    loss_weights=[xent_loss_weight,
      kl_loss_var],
    optimizer=optim,
    metrics={'x_pred': ['categorical_accuracy',vae_anneal_metric]}
    )

# Verbosity setting read from params and stored for later use.
keras_verbose = params['verbose_print']


Training set size is 249455
first smiles: " CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1 "
total chars: 35
Vectorization...
Total Data size 249455
shape of input vector : {} (224406, 120, 35)
Training set size is (224406, 120, 35), after filtering to max length of 120



From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:432: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.






From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3535: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.






From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:113: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



Instructions for updating:
keep_dims is deprecated, use keepdims instead


From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:1210: calling reduce_prod_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:2878: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
keep_dims is deprecated, use keepdims instead


From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:1192: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead





From /gpfs/accounts/welchjd_root/welchjd/hengshi/GAN/perturb_gan/chemical_vae-master_newTrain/chemvae_newTrain_train/sampled_rnn_tf.py:64: The name tf.set_random_seed is deprecated. Please use tf.compat.v1.set_random_seed instead.



Instructions for updating:
keep_dims is deprecated, use keepdims instead


From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:1156: calling reduce_max_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:794: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor





From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/optimizers.py:697: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.






From /home/hengshi/.conda/envs/chemvaeLower/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:2749: The name tf.log is deprecated. Please use tf.math.log instead.



x_mean shape in kl_loss:  (?, 196)


In [16]:
# Scratch inspection: show the backend's learning-phase tensor.
K.learning_phase()

<tf.Tensor 'encoder_norm0/keras_learning_phase:0' shape=<unknown> dtype=bool>

In [18]:
# Scratch experiment: build an in_train_phase switch between two constants so
# its attributes can be inspected in the next cell.
a = K.in_train_phase(10, 100)

In [20]:
# Scratch inspection of the flag on the tensor built above; depends on `a`
# from the preceding cell.
a._uses_learning_phase

True

In [24]:
# Scratch inspection of the learning-phase tensor again.
# NOTE(review): the recorded output names this tensor 'Print_1:0' instead of
# '...keras_learning_phase:0', which suggests the learning phase was re-bound in
# a cell that is not shown here -- confirm before relying on a fresh-kernel run.
(K.learning_phase())

<tf.Tensor 'Print_1:0' shape=<unknown> dtype=bool>