<a href="https://colab.research.google.com/github/umang66782/Plaksha/blob/main/Drug_design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Generation of New Chemical Structures Using a Convolutional Neural Network and a Variational Autoencoder

Chemical structures are represented using SMILES strings (see https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system), and you have access to structures that exist in the ZINC database (see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1360656/).
Ideally, new structures would also have to be screened, but here we will only implement the part that is responsible for generating new structures.

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf 
import matplotlib
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches 
%matplotlib inline
import keras
from keras import layers
from keras.models import Model 
from keras import metrics
from keras import backend as K 
import tensorflow.compat.v1.keras.backend as K 
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.optimizers import Adam 
import warnings 
warnings.filterwarnings("ignore")

In [None]:
!pip install kora -q
import kora.install.rdkit
from rdkit import Chem

DataSet References : - http://zinc.docking.org/tranches/home/

In [None]:
import pandas as pd
import numpy as np
# Read the dataset CSV from the mounted Drive folder. As the displayed frame
# shows, it has an "Unnamed: 0" column plus `zinc_id` and `smiles` columns.
data = pd.read_csv('/content/drive/MyDrive/dataset.csv')
data

Unnamed: 0,zinc_id,smiles
0,ZINC000000008151,C[C@H]1[C@@H](O)[C@H](CO)O[C@@H](O)[C@@H]1N
1,ZINC000000008153,CC[C@@H]1[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O
2,ZINC000000008155,CC1(C)[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O
3,ZINC000000018276,CS[C@@H]1CN[C@@H](CO)[C@H](O)[C@H]1O
4,ZINC000000018279,CS[C@@H]1[C@@H](O)CN[C@@H](CO)[C@@H]1O
...,...,...
427851,ZINC000242463989,O[Cl+3](O)(O)O
427852,ZINC000247713634,O1[SiH2][SiH2]O[SiH2][SiH2]1
427853,ZINC000252581626,O[Si](O)(O)F
427854,ZINC000685945533,Cn1nnnc1S(=O)(=O)F


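The vocabulary below maps each unique SMILES character to an index so that SMILES strings can be converted to NumPy arrays. The encoder is used to encode SMILES strings; the decoder is used after getting a prediction from the VAE decoder to reverse the NumPy array back into a SMILES string.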

References :-https://iwatobipen.wordpress.com/2017/01/22/encode-and-decode-smiles-strings/

In [None]:
# Vocabulary of characters accepted in a SMILES string. A character's position
# in this list is its one-hot column index; ' ' sits at index 0.
SMILES_CHARS = (
    [' ']
    + list('#%()+-./')
    + list('0123456789')
    + list('=@')
    + list('ABCFHIKLMNOP')
    + list('RSTVXZ')
    + ['[', '\\', ']']
    + list('abcegilnoprs')
    + list('tu')
)
# Forward (character -> index) and reverse (index -> character) lookup tables.
smi2index = {char: idx for idx, char in enumerate(SMILES_CHARS)}
index2smi = {idx: char for idx, char in enumerate(SMILES_CHARS)}

## SMILES Encoder
Used to take SMILES strings from the dataset as input and one-hot encode them.

In [None]:
def smiles_encoder( smiles, maxlen=62 ):
    """One-hot encode a SMILES string.

    The input is first canonicalised by round-tripping it through RDKit
    (``Chem.MolFromSmiles`` followed by ``Chem.MolToSmiles``). Row ``i`` of the
    result holds a single 1 in the column of the i-th character of the
    canonical string; rows past the end of the string stay all-zero.

    Parameters
    ----------
    smiles : str
        SMILES string to encode.
    maxlen : int, optional
        Number of rows of the output, i.e. the longest canonical SMILES
        string that can be encoded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))`` containing zeros and ones.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``, if the canonical string is longer
        than ``maxlen``, or if it contains a character missing from
        ``SMILES_CHARS``.
    """
    mol = Chem.MolFromSmiles( smiles )
    # RDKit signals a parse failure by returning None instead of raising.
    if mol is None:
        raise ValueError( 'RDKit could not parse SMILES string: %r' % (smiles,) )
    canonical = Chem.MolToSmiles( mol )
    if len( canonical ) > maxlen:
        raise ValueError(
            'Canonical SMILES %r has %d characters, more than maxlen=%d'
            % (canonical, len( canonical ), maxlen) )
    z = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( canonical ):
        if c not in smi2index:
            raise ValueError(
                'Character %r in SMILES %r is not in SMILES_CHARS' % (c, canonical) )
        z[i, smi2index[c]] = 1
    return z

## SMILES Decoder
Used for decoding the one-hot array back into SMILES characters after the VAE makes a prediction.

In [None]:
def smiles_decoder( z ):
    """Convert a per-position score matrix back into a character string.

    For every row of ``z`` the column holding the largest value is picked
    (``argmax`` over the last axis) and translated to its character through
    ``index2smi``. The characters are joined in row order; no characters are
    stripped from the result.
    """
    best_columns = z.argmax( axis=-1 )
    return ''.join( index2smi[idx] for idx in best_columns )

Use one-hot encoding to convert the SMILES characters into a NumPy array.

In [None]:
# One-hot encode the first 50,000 SMILES strings, then stack the
# per-molecule matrices into one array.
p = [smiles_encoder(smi) for smi in data['smiles'][:50000]]
a = np.array(p)

In [None]:
a.shape

(50000, 62, 56)

Reshaping is required to add a channel dimension.

In [None]:
# Append a trailing axis of size 1 so each molecule becomes a 62 x 56 x 1 grid.
a = np.reshape(a, (-1, 62, 56, 1))

In [None]:
a.shape

(50000, 62, 56, 1)

In [None]:
a

array([[[[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        ...,

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]],


       [[[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        ...,

        [[0.],
 

## Encoder
Takes as input the SMILES strings after they have been converted into a NumPy array.

In [None]:
# Hyper-parameters and the model input; data_shape matches one reshaped
# one-hot sample (62 x 56 x 1).
data_shape = (62, 56, 1)
batch_size = 256
latent_dim = 2
input_data = keras.Input(shape=data_shape)

# Convolutional stack. Only the first Conv2D is strided, so it is the one
# layer here that changes the spatial size of the feature map.
x = layers.Conv1D(filters=128, kernel_size=7,
                  padding='same', activation='relu')(input_data)
x = layers.Conv2D(filters=128, kernel_size=3, strides=(2, 2),
                  padding='same', activation='relu')(x)
x = layers.Conv2D(filters=128, kernel_size=3,
                  padding='same', activation='relu')(x)
x = layers.Conv2D(filters=128, kernel_size=3,
                  padding='same', activation='relu')(x)

# Keep the pre-flatten shape; the decoder uses it to rebuild the feature map.
shape_flattening = K.int_shape(x)

x = layers.Flatten()(x)
x = layers.Dense(units=32, activation='sigmoid')(x)

# Two parallel linear heads of size latent_dim, both fed to the sampling step.
mu = layers.Dense(units=latent_dim)(x)
log_sigma = layers.Dense(units=latent_dim)(x)

In [None]:
shape_flattening

(None, 31, 28, 128)

In [None]:
x.shape

TensorShape([None, 32])

In [None]:
mu.shape

TensorShape([None, 2])

In [None]:
log_sigma.shape

TensorShape([None, 2])

## Latent Space
Sample a latent vector using the mu and sigma produced by the encoder.

In [None]:
def sampling(args):
    """Draw a latent sample ``mu + exp(log_sigma) * epsilon``.

    ``args`` is the pair ``(mu, log_sigma)``. ``epsilon`` is standard-normal
    noise with one row per row of ``mu`` and ``latent_dim`` columns.
    """
    z_mean, z_log_sigma = args
    n_rows = K.shape(z_mean)[0]
    noise = K.random_normal(shape=(n_rows, latent_dim), mean=0., stddev=1.)
    return z_mean + K.exp(z_log_sigma) * noise


# Wrap the sampling function in a Lambda layer so it becomes part of the graph.
X = layers.Lambda(sampling)([mu, log_sigma])

In [None]:
X.shape

TensorShape([None, 2])

In [None]:
mu.shape

TensorShape([None, 2])

In [None]:
log_sigma.shape 

TensorShape([None, 2])

The two-dimensional mu and sigma come from the encoder, and the sampled vector has the same shape.

## Decoder
The decoder takes as input the vector sampled from the distribution defined by mu and sigma.

In [None]:
# Stand-alone decoder model: latent vector in, reconstructed grid out.
decoder_input = layers.Input(shape=K.int_shape(X)[1:])

# Expand the latent vector to the size of the encoder's pre-flatten feature
# map and restore that map's shape.
x = layers.Dense(units=np.prod(shape_flattening[1:]),
                 activation='relu')(decoder_input)
x = layers.Reshape(target_shape=shape_flattening[1:])(x)

# Stride-2 transposed convolution, then a single-channel sigmoid output.
x = layers.Conv2DTranspose(filters=32, kernel_size=3, strides=(2, 2),
                           padding='same', activation='relu')(x)
x = layers.Conv2D(filters=1, kernel_size=3,
                  padding='same', activation='sigmoid')(x)

decoder = Model(inputs=decoder_input, outputs=x)
# Apply the decoder to the sampled latent vector to close the autoencoder loop.
X_decoded = decoder(X)

In [None]:
decoder_input.shape

TensorShape([None, 2])

In [None]:
X_decoded.shape 

TensorShape([None, 62, 56, 1])

In [None]:
class CVariationalLayer(keras.layers.Layer):
    """Pass-through layer that attaches the VAE training loss to the model.

    The layer receives ``[input, reconstruction]``, registers the VAE loss via
    ``add_loss`` and returns the input unchanged. The KL term reads the
    notebook-level tensors ``mu`` and ``log_sigma`` produced by the encoder.
    """

    def vae_loss(self, x, X_decoded):
        """Return the scalar VAE loss for inputs ``x`` and reconstruction ``X_decoded``.

        The loss is the binary cross-entropy between the flattened tensors plus
        a KL regularisation term weighted by 5e-4.
        """
        x = K.flatten(x)
        X_decoded = K.flatten(X_decoded)
        # Reconstruction loss
        rcon_loss = keras.metrics.binary_crossentropy(x, X_decoded)
        # Regularisation: KL divergence between N(mu, sigma^2) and N(0, 1).
        # `sampling` scales the noise by exp(log_sigma), so log_sigma is the log
        # of the standard deviation: log(sigma^2) = 2 * log_sigma and
        # sigma^2 = exp(2 * log_sigma).
        kl_loss = -5e-4 * K.mean(
            1 + 2 * log_sigma - K.square(mu) - K.exp(2 * log_sigma), axis=-1)
        # Complete loss
        return K.mean(rcon_loss + kl_loss)

    def call(self, inputs):
        """Register the VAE loss for ``inputs = [x, X_decoded]`` and return ``x``."""
        x = inputs[0]
        X_decoded = inputs[1]
        loss = self.vae_loss(x, X_decoded)
        self.add_loss(loss, inputs=inputs)
        # The output value is not used for training; only the added loss matters.
        return x

Con_var = CVariationalLayer()([input_data, X_decoded])

## Vae Model

In [None]:
# End-to-end VAE from the encoder input to the loss-carrying layer output.
# compile() is given no `loss` argument: the training loss is the one that
# CVariationalLayer registers through add_loss.
vae = Model(input_data, Con_var)
vae.compile(optimizer='adam')
vae.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 62, 56, 1)]  0                                            
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 62, 56, 128)  1024        input_11[0][0]                   
__________________________________________________________________________________________________
conv2d_20 (Conv2D)              (None, 31, 28, 128)  147584      conv1d_5[0][0]                   
__________________________________________________________________________________________________
conv2d_21 (Conv2D)              (None, 31, 28, 128)  147584      conv2d_20[0][0]                  
___________________________________________________________________________________________

From the VAE summary: the input and output dimensions are the same.

## Vae fit

In [None]:
# No targets are passed: the loss is attached inside the model via add_loss.
# NOTE(review): `batch_size` (256), defined with the encoder, is not passed
# here, so fit() falls back to its default batch size — confirm this is intended.
vae.fit(a,shuffle=True,epochs=10)

Train on 50000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8ec00ccdd8>

## Decoder Prediction 

In [None]:
# Decode a single hand-picked point of the 2-D latent space.
sample_vec = np.array([[0,5]])
pred = decoder.predict(sample_vec)

In [None]:
pred

array([[[[4.07085499e-05],
         [5.72561294e-05],
         [1.13099530e-08],
         ...,
         [6.66556730e-07],
         [5.88761282e-07],
         [6.35818253e-07]],

        [[3.27630150e-05],
         [4.55736332e-02],
         [3.34496386e-09],
         ...,
         [3.92282715e-07],
         [1.65238745e-07],
         [1.96868157e-08]],

        [[3.95061170e-06],
         [1.52238389e-03],
         [3.10878256e-08],
         ...,
         [3.55010421e-08],
         [3.48444864e-08],
         [3.26791536e-08]],

        ...,

        [[1.05127242e-07],
         [5.06047870e-09],
         [9.67321001e-10],
         ...,
         [3.18951834e-08],
         [3.78826037e-08],
         [1.90219573e-08]],

        [[1.22041133e-08],
         [4.21553598e-10],
         [3.19906671e-11],
         ...,
         [1.06115134e-08],
         [6.34982955e-09],
         [1.39910439e-08]],

        [[1.13768147e-05],
         [8.00145699e-07],
         [1.15484134e-07],
         ...,
 

Take random inputs, get the predictions from the decoder, and convert them into SMILES characters.

In [None]:
# Check that one decoder output can be viewed as a 62 x 56 grid. The prediction
# is recomputed from `sample_vec` so this cell does not depend on `pred1`,
# which is only assigned in a later cell.
decoder.predict(sample_vec)[0].reshape(62,56).shape

(62, 56)

In [None]:
import random

# Decode 10 x 10 randomly drawn integer latent coordinates, one at a time.
pred = []
for i in random.sample(range(1, 1000), 10):
    for j in random.sample(range(1, 1000), 10):
        sample_vec = np.array([[i, j]])
        pred1 = decoder.predict(sample_vec)
        decoded_smiles = smiles_decoder(pred1[0].reshape(62, 56))
        # Drop the ' ' characters before storing the string.
        pred.append(decoded_smiles.replace(" ", ""))

# Show the distinct decoded strings.
set(pred)

{'O=C1@@@@@@[',
 'O=C1@@@@C@@[',
 'O=C1@@@@[',
 'O=C1@@@C@@[CH',
 'O=C1@@@[',
 'O=C1@@C@@[',
 'O=C1@@H@@@@@@@@@[]',
 'O=C1@@H@@@@@@@@[',
 'O=C1@@H@@@@@@C@@[',
 'O=C1@@H@@@@H@@@C@@[',
 'O=C1@@H@@@@H@@@C@@[]',
 'O=C1@@H@@@@H@@@]@@[]',
 'O=C1@@H@@@@HH2(@@@@@CH@@@)[CCHH]2C',
 'O=C1@@H@@@@HH2(@@@@@CH@@@C[CCHH]22C11@2',
 'O=C1@@H@@@@HH2(@@@@@CH@@@C[CC]H]22C11@@@222',
 'O=C1@@H@@@@HH2[@@@@@CH@@@)[CCHH]22C1C',
 'O=C1@@H@@@@HH2[@@@@@CH@@@C[CCHH]22C1C@@@22',
 'O=C1@@H@@@@HHN[@@@@@CC@@@C[CCHH]22C1C@@@2222111',
 'O=C1@@H@@@@HHN[@@@@@CCC@@C[CCHH]2CC1C@@@2222111112111',
 'O=C1@@H@@@@H[[@@@@@CCH@@)[CCH]',
 'O=C1@@H@@@HH[@@@]@CH@@[CCH@',
 'O=C1@@H@@@HH[C[@@@]@CC@@@C[CCHH@NCC1C@@@2@222111121',
 'O=C1@@H@@@HH[C[@@@]@CCH@@C[CCHH@',
 'O=C1@@H@@@HH[C[@@@]@CCH@@C[CCHH@NCCCC@@@2@22211',
 'O=C1@@H@@@HH[N[@@@]@CCH@@C[CCHH@C',
 'O=C1@@H@@@HH[[@@@]@CCH@@)[CCH@',
 'O=C1@@H@@@HH[[@@@]@CCH@@C[CCHH@',
 'O=C1@@H@@@H[@@@]@C@@[CH',
 'O=C1@@]@@@@@H(([@@@@]C@@@C[CC@H]22C11@@22',
 'O=C1@@]@@@@@H(@@@)]C@@@C[]',
 'O=C1@@]@@

## Finally, we get a collection of many generated molecule strings.