## Novel Molecule Generation using Bidirectional-Recurrent Neural Networks with Attention Applied to Simplified Molecular Input Line Entry System (SMILES)

## Train

author: anthony j. vasquez
email: vanthony715@gmail.com / avasque1@jh.edu

In [1]:
# Notebook setup: start the wall-clock timer, extend the import path, report the
# host platform, import third-party and project-local dependencies, and choose
# the torch device. Statement order matters: t0 must be taken first and the
# sys.path change must precede the project-local imports below.
import time
from datetime import datetime
t0 = time.time()  # notebook start time; a later cell prints time.time() - t0

import sys
# make modules located under ./data/ importable
sys.path.append('./data/')

import gc
gc.collect()

import os
## CUDA_LAUNCH_BLOCKING caused trouble on Windows under Spyder and Jupyter, so it stays disabled
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

import platform
print("Operating System: ", platform.system())
print("Machine Type:: ", platform.machine())

import multiprocessing as mp
max_processors = mp.cpu_count()  # logical CPU count reported by the OS
print('Processor Count: ', max_processors, '\n')

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

# torchviz and torchinfo are extra dependencies; install them once if missing
# !pip install torchviz
# !pip install torchinfo
from torchinfo import summary
from torchviz import make_dot

# NOTE(review): wildcard imports hide where names come from. The cells below use
# Generator, Discriminator, SMILESDataset, build_vocabulary, tokenize_smiles,
# count_parameters and count_trainable_parameters, presumably defined in these
# three modules — confirm and switch to explicit imports.
from net import *
from utils import *
from custom_dataset import *

##hardware params
# NOTE(review): the device is hard-coded to CPU and tagged DEBUG; confirm whether
# a GPU device was intended before running a full training session.
DEVICE = torch.device('cpu') #DEBUG

Operating System:  Linux
Machine Type::  x86_64
Processor Count:  24 



#### Hyperparameters

In [2]:
##data
HOLDOUT_PERC = 0.90 #fraction (not a percentage) passed as test_size to train_test_split; 0.90 holds out 90% of the SMILES
TRAIN_BS = 128 #train batch size
PREFETCH_FACTOR = 4 #intended DataLoader prefetch factor; NOTE(review): not passed to the DataLoader built in the visible cells
NUM_WORKERS = 8 #intended DataLoader worker count; NOTE(review): not passed to the DataLoader built in the visible cells

##training
N_EPOCHS = 500 #num of training epochs
OPTIMIZER = 'rmsprop' #optimizer name; the alternative noted by the author is 'adam'
LRG = 0.004086800025392213 #learning rate generator
LRD = 0.013448070902660135 #learning rate discriminator
LRS_SZ = 5 #learning rate scheduler step size
LRS_GAMMA = 0.99 #learning rate scheduler gamma
BETAS = (0.5, 0.999) #momentum moving-average coefficients; presumably only used when OPTIMIZER is 'adam' — TODO confirm
DROPOUT_PROB = 0.5 #dropout probability passed to both networks
WEIGHT_DECAY = 1e-5 #L2 Regularization
RUN_EXTRA_TIMES = 3 #extra iterations over the validator (presumably the discriminator — confirm) for each generator update
EMBEDDING_DIM = 32 #size of the embedding vector that represents each input token
HIDDEN_DIM = 128 #hidden size passed to both networks as hidden_dim
N_LAYERS = 2 #num gru layers
BIDIRECTIONAL = True #makes gru layer bidirectional
N_HEADS = 4 #number of attention heads passed to both networks as num_heads
CLIP_VALUE = 0.008101987508250374 ##WGAN discriminator clip value for training stabilization

##inference
N_SAMPLES = 1000 #presumably the number of molecules to sample at inference — usage not visible here
MAX_LENGTH = 10 #NOTE(review): distinct from the lowercase max_length computed from the data in a later cell; usage not visible here

##visualization params
SIGMA = 2 #loss smoothing for lineplot
PRINT_LOSS_EVERY = 2 #how many epochs between loss printouts

##logging
RESULTS_PATH = '../results/'

In [3]:
%%time

### Import the data
data_file = pd.read_csv('../data/Zinc_all_smiles_data.txt', header = None)
data_file.columns = ['smiles']
smilesList = data_file['smiles'].tolist()

##split dset
train_smiles, holdout_smiles, _, _ = train_test_split(smilesList, smilesList, test_size=HOLDOUT_PERC, random_state=42)
print('Len all smiles: ', len(smilesList))
print('Len train smiles: ', len(train_smiles))
print('Len holdout smiles: ', len(holdout_smiles))

print('\n')

Len all smiles:  249456
Len train smiles:  24945
Len holdout smiles:  224511


CPU times: user 269 ms, sys: 18.6 ms, total: 287 ms
Wall time: 287 ms


In [4]:
%%time

# Example usage:
vocab = build_vocabulary(smilesList)
max_length = max(len(tokenize_smiles(smiles)) for smiles in smilesList)

print('\n')



CPU times: user 686 ms, sys: 10.2 ms, total: 696 ms
Wall time: 695 ms


In [5]:
%%time

##define dataset
dataset = SMILESDataset(train_smiles, vocab, max_length)
dataloader = DataLoader(dataset, batch_size=TRAIN_BS, shuffle=True, pin_memory=False)

print('Data Summary: ')
print('smilesList Len: ', len(smilesList))
print('vocab len: ', len(vocab))
print('max_length: ', max_length)
print('\n')

Data Summary: 
smilesList Len:  249456
vocab len:  254593
max_length:  27


CPU times: user 239 µs, sys: 24 µs, total: 263 µs
Wall time: 260 µs


### Instantiate Generator and Discriminator Networks

In [6]:
%%time

##init networks
n_gen = Generator(vocab_size=len(vocab) + 1, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, num_layers=N_LAYERS, max_length=max_length, 
                  num_heads=N_HEADS, dropout_prob=DROPOUT_PROB, bidirectional=BIDIRECTIONAL)

n_disc = Discriminator(vocab_size=len(vocab) + 1, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, num_layers=N_LAYERS, max_length=max_length, 
                       num_heads=N_HEADS, dropout_prob=DROPOUT_PROB, bidirectional=BIDIRECTIONAL)

print('\n')



CPU times: user 83.2 ms, sys: 11 ms, total: 94.2 ms
Wall time: 93.8 ms


### Generator Network

In [7]:
##report total and trainable parameter counts for the generator
for _label, _counter in (
    ('\nTotal generator param cnt: ', count_parameters),
    ('Trainable generator param cnt: ', count_trainable_parameters),
):
    print(_label, _counter(n_gen))


Total generator param cnt:  8962624
Trainable generator param cnt:  8962624


In [8]:
# show_model_details(model=n_gen)

##just get the size of the input
##a single batch is enough; next(iter(...)) replaces the loop-and-break over a
##tqdm-wrapped loader, which left a stalled 0% progress bar in the cell output
real_smiles = next(iter(dataloader))
input_size = real_smiles.shape
example_input = real_smiles.to(DEVICE)
print('Input Size: ', input_size)

##pin the summary to DEVICE so torchinfo does not select a device on its own
##and relocate the model away from the tensors created in this notebook
print(summary(n_gen, input_size=input_size, device=DEVICE))

  0%|                                                                                           | 0/195 [00:00<?, ?it/s]

Input Size:  torch.Size([128, 27])





Layer (type:depth-idx)                   Output Shape              Param #
Generator                                [128, 27]                 --
├─Embedding: 1-1                         [128, 27, 32]             8,147,008
├─GRU: 1-2                               [128, 27, 256]            420,864
├─Dropout: 1-3                           [128, 27, 256]            --
├─MultiheadAttention: 1-4                [27, 128, 256]            263,168
├─Linear: 1-5                            [128, 27, 512]            131,584
Total params: 8,962,624
Trainable params: 8,962,624
Non-trainable params: 0
Total mult-adds (G): 2.51
Input size (MB): 0.01
Forward/backward pass size (MB): 22.12
Params size (MB): 34.80
Estimated Total Size (MB): 56.93


In [9]:

# ##TODO get the graph to work. It currently fails with an error that the tensors are not all on the same device.
# example_input = torch.randint(0, 10000, (32, 100)).long().to(DEVICE)

# # Forward pass through the model
# output = n_gen(example_input)
    
# Generate the graph
# graph = make_dot(output, params=dict(n_gen.named_parameters()))

### Discriminator Network

In [10]:
##report total and trainable parameter counts for the discriminator
for _label, _counter in (
    ('\nTotal discriminator param cnt: ', count_parameters),
    ('Trainable discriminator param cnt: ', count_trainable_parameters),
):
    print(_label, _counter(n_disc))


Total discriminator param cnt:  8666945
Trainable discriminator param cnt:  8666945


In [11]:
# show_model_details(model=n_disc)
##layer-by-layer summary for the discriminator using the batch shape captured earlier;
##device is pinned to DEVICE so torchinfo does not select a device on its own
print(summary(n_disc, input_size=input_size, device=DEVICE))

Layer (type:depth-idx)                   Output Shape              Param #
Discriminator                            [128, 1]                  --
├─Embedding: 1-1                         [128, 27, 32]             8,147,008
├─GRU: 1-2                               [128, 27, 256]            420,864
├─Linear: 1-3                            [128, 27, 128]            32,896
├─Dropout: 1-4                           [128, 27, 128]            --
├─MultiheadAttention: 1-5                [27, 128, 128]            66,048
├─Linear: 1-6                            [128, 1]                  129
Total params: 8,666,945
Trainable params: 8,666,945
Non-trainable params: 0
Total mult-adds (G): 2.50
Input size (MB): 0.01
Forward/backward pass size (MB): 11.50
Params size (MB): 34.40
Estimated Total Size (MB): 45.92


In [12]:
##release collectable Python objects and cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

##elapsed wall-clock seconds since t0 was taken in the first cell
tf = time.time()
_elapsed_seconds = tf - t0
print('Total Runtime: ', np.round(_elapsed_seconds, 3))

Total Runtime:  7.999
