In [3]:
import numpy as np
import pandas as pd
from utils import *
import random
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from scipy import stats


from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Dense, Reshape, merge, Dot, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# 1. Data Preprocessing

In [None]:
df = pd.read_csv('zinc_4325992_wash.csv')

In [None]:
df.head()

In [None]:
from IPython.display import display, SVG
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

def show(mol,molSize=(475,175),kekulize=True):
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except:
            mc = Chem.Mol(mol.ToBinary())
    assert mc.GetNumConformers() > 0
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    display(SVG(svg.replace('svg:','')))

In [None]:
length = len(df)
print("numbers: ", length)
smiles = df["mol"]
smileslist = list(smiles)

In [None]:
validatedsmiles = []

notvalidated = 0
for smile in smileslist:
    m = Chem.MolFromSmiles(smile)
    if m is None:
        notvalidated += 1
        if notvalidated % 10000 == 0:
            #print("not validated:", notvalidated)
            print("not validated:", smile)
    else:
        validatedsmiles.append(smile)
print("non validated smiles:", notvalidated)
print("not validate smiles rate:", notvalidated/length)

In [None]:
data = '\n'.join(validatedsmiles)
#data= data.lower().strip()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))
print("first smiles: ", smileslist[0])
print(chars)

In [None]:
lensmile = [len(x) for x in validatedsmiles]

print("minimum:", np.min(lensmile))
print("mean:", np.mean(lensmile))
print("std:", np.std(lensmile))
print("maximum:",np.max(lensmile))

plt.hist(lensmile, bins=500, facecolor='r',histtype='stepfilled')
#plt.plot(x,y,color='b')   
 
plt.xlabel('Length')
plt.ylabel('Number of Lenth')
plt.title('Distribution of Smiles Length')
plt.savefig('histogramDescription.pdf', format='pdf', dpi=300)

In [None]:
print(validatedsmiles[np.argmin(lensmile)])
m = Chem.MolFromSmiles(validatedsmiles[np.argmin(lensmile)])
AllChem.Compute2DCoords(m)
show(m)

In [None]:
print(validatedsmiles[np.argmax(lensmile)])
m = Chem.MolFromSmiles(validatedsmiles[np.argmax(lensmile)])
AllChem.Compute2DCoords(m)
show(m)

In [None]:
type(validatedsmiles)

In [None]:
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)
print(char_to_ix)

In [None]:
validatedsmiles_pd = pd.DataFrame(validatedsmiles, columns=['smiles'])

In [None]:
validatedsmiles_pd.to_csv('validatedsmiles_pd.csv', encoding='utf-8', index=False, header=True)

## Read validated smils from csv

In [None]:
smiles = pd.read_csv('validatedsmiles_pd.csv')

In [None]:
smiles.head()

In [None]:
lensmile = [len(x) for x in smiles['smiles']]

print("minimum:", np.min(lensmile))
print("mean:", np.mean(lensmile))
print("std:", np.std(lensmile))
print("maximum:",np.max(lensmile))

In [None]:
mu = np.mean(lensmile)
mu

sigma = np.std(lensmile)
sigma

interval = stats.norm.interval(0.98, mu, sigma) 
interval

In [None]:
## add lens to data frame

#smiles['lens'] = pd.Series(lensmile)

### Filter smiles only contains len(smiles) <= 60

In [None]:
finalsmiles = smiles[(lensmile >= interval[0]) & (lensmile <= interval[1])]

In [None]:
finalsmiles.to_csv('finalsmiles_pd.csv', encoding='utf-8', index=False, header=True)

# 2. Smiles to Index

## Read smiles from data

In [7]:
smiles_pd = pd.read_csv('finalsmiles_pd.csv')
smiles_pd.head()

Unnamed: 0,smiles,lens
0,S([C@@H](C(=O)N1CCCC1)C)C1=NC(=O)c2c(N1)ccc(F)c2,48
1,O=C(NCC1(Cn2nccc2)CC1)c1nnc([O-])c2c1cccc2,42
2,S(=O)(=O)(CCN[C@@H]([C@H](C)n1nccc1)C)c1ccccc1,46
3,S(CCC(=O)N1CCNCC1)Cc1ccc(C)cc1,30
4,Fc1c(CNC2CCN(C(=O)c3cocc3)CC2)c(F)ccc1,38


In [8]:
smiles_pd.shape

(2883247, 2)

### (Optional) Keras Tokenizer by Char 

In [None]:
t = Tokenizer(lower=False, char_level=True, split='')
t.fit_on_texts(data)
vocab_size = len(t.word_index) + 1
print("vocab size: " + str(vocab_size))
#word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

word_to_index = t.word_index
print(word_to_index)

index_to_word = {v: k for k, v in word_to_index.items()}

encoded_X = t.texts_to_sequences(data)

In [None]:
encoded_X[3]

## Smiles to Char list

In [11]:
smileslist = smiles_pd['smiles']

In [12]:
datawithnextline = '\n'.join(smileslist)
data = ''.join(smileslist)

In [13]:
#chars = set(data)
def precess_smiles_as_charlist(data):

    allchars = []

    i = 0

    while i < len(data):

        if data[i] == 'B' and data[i+1] == 'r':
            allchars.append('Br')
            i += 1

        elif data[i] == 'C' and data[i+1] == 'l':
            allchars.append('Cl')
            i += 1

        else:    
            allchars.append(data[i])

        i += 1
    
    return allchars

## smiles to char list without '\n'

In [14]:
allchars = precess_smiles_as_charlist(data)
chars = set(allchars)

## smiles to char list with '\n'

In [15]:
allcharswithn = precess_smiles_as_charlist(datawithnextline)

### Dicts for char_to_index and index_to_char

* Atom: C H O N S P F Cl Br I B c o n s p
* bond: . = # : $
* electonric: + -
* @ \ /
* () []
* 1 2 3 4 5 6 7 8 9 %
* Other

In [16]:
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)
#print(char_to_ix)

{0: '#', 1: '(', 2: ')', 3: '+', 4: '-', 5: '/', 6: '0', 7: '1', 8: '2', 9: '3', 10: '4', 11: '5', 12: '6', 13: '=', 14: '@', 15: 'Br', 16: 'C', 17: 'Cl', 18: 'F', 19: 'H', 20: 'I', 21: 'N', 22: 'O', 23: 'P', 24: 'S', 25: '[', 26: '\\', 27: ']', 28: 'c', 29: 'n', 30: 'o', 31: 'p', 32: 's'}


We dont have elements: 
.  : $ 7 8 9 Other , where '\\\' is '\'

## Use char_to_ix to indexing

In [19]:
allsmilesindexlist = []
smileindexlist = []

for char in allcharswithn:
    
    if char != '\n':
        smileindexlist.append(char_to_ix[char])
    else:
        allsmilesindexlist.append(smileindexlist)
        smileindexlist = []

In [20]:
len(allsmilesindexlist)

2883246

In [21]:
allsmilesindexlist[0]

[24,
 1,
 25,
 16,
 14,
 14,
 19,
 27,
 1,
 16,
 1,
 13,
 22,
 2,
 21,
 7,
 16,
 16,
 16,
 16,
 7,
 2,
 16,
 2,
 16,
 7,
 13,
 21,
 16,
 1,
 13,
 22,
 2,
 28,
 8,
 28,
 1,
 21,
 7,
 2,
 28,
 28,
 28,
 1,
 18,
 2,
 28,
 8]

In [22]:
window_size = 8
vector_dim = 200
vocab_size = len(char_to_ix)

In [23]:
s_couples, s_labels = skipgrams(allsmilesindexlist[0], vocab_size, window_size=window_size)

### Skip-gram (only 50000 molecualrs, require more)

In [24]:
#sampling_table = sequence.make_sampling_table(vocab_size)
couples = []
labels = []

i = 0
for smileindex in allsmilesindexlist:
   
    s_couples, s_labels = skipgrams(smileindex, vocab_size, negative_samples = 1, window_size=window_size)
   
    couples += s_couples
    labels += s_labels
    
    i += 1
    if i % 10000 == 0:
        print('process in smile:', i)
        
    if i == 50000:
        break

process in smile: 10000
process in smile: 20000
process in smile: 30000
process in smile: 40000
process in smile: 50000


In [25]:
len(couples)

59698664

In [26]:
len(labels)

59698664

In [27]:
couples[:100]

[[2, 12],
 [14, 23],
 [28, 18],
 [14, 14],
 [16, 1],
 [16, 25],
 [28, 1],
 [2, 1],
 [16, 13],
 [13, 11],
 [7, 2],
 [7, 14],
 [14, 19],
 [21, 8],
 [16, 25],
 [13, 28],
 [1, 14],
 [7, 2],
 [7, 14],
 [16, 10],
 [2, 12],
 [28, 2],
 [18, 28],
 [28, 21],
 [1, 16],
 [22, 12],
 [2, 6],
 [2, 16],
 [2, 21],
 [2, 22],
 [16, 13],
 [27, 16],
 [14, 15],
 [28, 32],
 [8, 19],
 [13, 16],
 [13, 2],
 [2, 7],
 [21, 4],
 [28, 21],
 [16, 7],
 [28, 8],
 [2, 25],
 [27, 22],
 [13, 13],
 [21, 20],
 [2, 22],
 [7, 7],
 [18, 22],
 [8, 13],
 [16, 15],
 [22, 12],
 [16, 8],
 [28, 7],
 [13, 28],
 [28, 28],
 [21, 30],
 [1, 2],
 [28, 22],
 [27, 24],
 [1, 14],
 [27, 12],
 [28, 18],
 [1, 18],
 [14, 22],
 [13, 22],
 [28, 10],
 [28, 28],
 [21, 24],
 [16, 28],
 [28, 8],
 [7, 28],
 [2, 32],
 [28, 28],
 [13, 3],
 [1, 14],
 [28, 28],
 [25, 24],
 [22, 21],
 [24, 27],
 [14, 5],
 [16, 21],
 [21, 26],
 [25, 3],
 [28, 26],
 [28, 5],
 [16, 16],
 [1, 2],
 [16, 32],
 [1, 19],
 [16, 1],
 [1, 12],
 [27, 7],
 [1, 13],
 [16, 2],
 [22, 22],

In [28]:
labels[:100]

[0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0]

In [29]:
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

In [30]:
print(len(word_target))
print(len(word_context))
print(word_target[:10], word_context[:10], labels[:10])

59698664
59698664
[ 2 14 28 14 16 16 28  2 16 13] [12 23 18 14  1 25  1  1 13 11] [0, 0, 0, 0, 1, 0, 1, 1, 0, 0]


##  Skipgram Model

In [31]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

## embedding layer
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

## target
target = embedding(input_target)

## context
context = embedding(input_context)

## merge context and target
dot_product = Concatenate(axis=2)([target, context])
dot_product = Reshape((400, ))(dot_product)

## predict
output = Dense(1, activation='sigmoid')(dot_product)

model = Model(input=[input_target, input_context], output=output)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 200)       6600        input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 1, 400)       0           embedding[0][0]                  
          



## Embeding Model

In [32]:
input_molecular = Input((1,))

embeddedMolecular = embedding(input_molecular)

embedding_model = Model(input=input_molecular, output=embeddedMolecular)

embedding_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 200)            6600      
Total params: 6,600
Trainable params: 6,600
Non-trainable params: 0
_________________________________________________________________


  """


In [33]:
model.compile(loss='binary_crossentropy', optimizer='adam')

## Trainning (Require more epochs)

In [35]:
model.fit(x = [word_target, word_context], y = labels, epochs = 1, batch_size=128)

Epoch 1/1


<keras.callbacks.History at 0x334f630f0>

In [36]:
embedding_model.save('mole2vec-skipgram.h5')

In [37]:
embedding_model = load_model('mole2vec-skipgram.h5')



In [38]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similariy between u and v
        
    Arguments:
        u -- a word vector of shape (n,)          
        v -- a word vector of shape (n,)

    Returns:
        cosine_similarity -- the cosine similarity between u and v defined by the formula above.
    """
    
    distance = 0.0
    
    ### START CODE HERE ###
    # Compute the dot product between u and v (≈1 line)
    dot = np.sum(u*v)
    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(np.square(u)))   
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(np.square(v)))
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u*norm_v)
    ### END CODE HERE ###
    
    return cosine_similarity

In [39]:
s1 = embedding_model.predict([char_to_ix['(']]) -  embedding_model.predict([char_to_ix[')']])
s2 = embedding_model.predict([char_to_ix['[']]) -  embedding_model.predict([char_to_ix[']']])

cosine_similarity(s1, s2)

-0.022250967

In [40]:
embedding_model.predict([char_to_ix['Cl']])

array([[[ 7.14578703e-02, -3.98030207e-02,  5.57898581e-02,
          2.50560809e-02, -3.30043361e-02, -1.08692139e-01,
          6.14518039e-02, -6.20625168e-02, -2.32791994e-03,
         -4.69901003e-02,  9.74946171e-02, -2.63790395e-02,
         -5.48885670e-03, -2.37427279e-03, -1.07383849e-02,
          3.64497080e-02,  4.25537936e-02,  1.39434161e-02,
          1.32589504e-01,  1.78124346e-02,  8.45811982e-03,
          9.87131894e-02, -1.10381786e-02,  3.94414179e-02,
         -2.99292728e-02,  1.76506913e+00, -1.74931195e-02,
         -2.59747487e-02, -8.26530978e-02, -1.30458577e-02,
          7.19384775e-02, -7.52303526e-02, -9.00663063e-02,
          8.07701051e-02,  8.69659148e-03, -4.14740667e-02,
         -1.18514530e-01, -1.65126529e-02, -1.80607289e-02,
          6.97746575e-02,  2.34687328e-01, -8.18871409e-02,
          1.91780627e+00,  3.13647762e-02, -3.44711915e-02,
          6.19419180e-02,  1.51685387e-01, -2.41593383e-02,
         -9.37212445e-03, -8.78912061e-0