In [10]:
import numpy as np
import pandas as pd
from utils import *
import random
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from scipy import stats


from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Dense, Reshape, merge, Dot, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence


from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

# 2. Smiles to Index

## Read smiles from data

In [3]:
smiles_pd = pd.read_csv('finalsmiles_pd.csv')
smiles_pd.head()

Unnamed: 0,smiles,lens
0,S([C@@H](C(=O)N1CCCC1)C)C1=NC(=O)c2c(N1)ccc(F)c2,48
1,O=C(NCC1(Cn2nccc2)CC1)c1nnc([O-])c2c1cccc2,42
2,S(=O)(=O)(CCN[C@@H]([C@H](C)n1nccc1)C)c1ccccc1,46
3,S(CCC(=O)N1CCNCC1)Cc1ccc(C)cc1,30
4,Fc1c(CNC2CCN(C(=O)c3cocc3)CC2)c(F)ccc1,38


In [4]:
smiles_pd.shape

(2883247, 2)

## Smiles to Char list

In [4]:
smileslist = smiles_pd['smiles']

In [5]:
datawithnextline = '\n'.join(smileslist)
data = ''.join(smileslist)

In [6]:
#chars = set(data)
def precess_smiles_as_charlist(data):

    allchars = []

    i = 0

    while i < len(data):

        if data[i] == 'B' and data[i+1] == 'r':
            allchars.append('Br')
            i += 1

        elif data[i] == 'C' and data[i+1] == 'l':
            allchars.append('Cl')
            i += 1

        else:    
            allchars.append(data[i])

        i += 1
    
    return allchars

## smiles to char list without '\n'

In [7]:
allchars = precess_smiles_as_charlist(data)
chars = set(allchars)

## smiles to char list with '\n'

In [None]:
allcharswithn = precess_smiles_as_charlist(datawithnextline)

### Dicts for char_to_index and index_to_char

* Atom: C H O N S P F Cl Br I B c o n s p
* bond: . = # : $
* electonric: + -
* @ \ /
* () []
* 1 2 3 4 5 6 7 8 9 %
* Other

In [11]:
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)
#print(char_to_ix)

{0: '#', 1: '(', 2: ')', 3: '+', 4: '-', 5: '/', 6: '0', 7: '1', 8: '2', 9: '3', 10: '4', 11: '5', 12: '6', 13: '=', 14: '@', 15: 'Br', 16: 'C', 17: 'Cl', 18: 'F', 19: 'H', 20: 'I', 21: 'N', 22: 'O', 23: 'P', 24: 'S', 25: '[', 26: '\\', 27: ']', 28: 'c', 29: 'n', 30: 'o', 31: 'p', 32: 's'}


We dont have elements: 
.  : $ 7 8 9 Other , where '\\\' is '\'

## smiles list

In [13]:
allsmilesindexlist = []
smileindexlist = []

for char in allcharswithn:
    
    if char != '\n':
        smileindexlist.append(char)
    else:
        allsmilesindexlist.append(smileindexlist)
        smileindexlist = []

In [14]:
len(allsmilesindexlist)

2883246

In [15]:
allsmilesindexlist[0]

['S',
 '(',
 '[',
 'C',
 '@',
 '@',
 'H',
 ']',
 '(',
 'C',
 '(',
 '=',
 'O',
 ')',
 'N',
 '1',
 'C',
 'C',
 'C',
 'C',
 '1',
 ')',
 'C',
 ')',
 'C',
 '1',
 '=',
 'N',
 'C',
 '(',
 '=',
 'O',
 ')',
 'c',
 '2',
 'c',
 '(',
 'N',
 '1',
 ')',
 'c',
 'c',
 'c',
 '(',
 'F',
 ')',
 'c',
 '2']

## Gensim Word2Vec

In [63]:
# sg = 1 -- skip-gram
# sg = 0 -- CBOW

model = Word2Vec(sentences=allsmilesindexlist, size=200, sg=1, window=8, seed=42, workers=8)

In [70]:
model.save("mole2vec-gensim-200.h5")

In [71]:
model = Word2Vec.load("mole2vec-gensim-200.h5") 

In [72]:
model.wv['C']

array([ 3.10500413e-02, -1.09459594e-01, -5.50814718e-02,  8.90447013e-03,
       -8.61630887e-02,  1.05193436e-01,  3.58807035e-02, -1.07865676e-01,
       -1.14267161e-02, -6.43913075e-02,  4.97403964e-02, -2.39011385e-02,
        1.07557058e-01,  4.51775007e-02,  9.14160237e-02, -6.38175458e-02,
        4.69371341e-02,  9.59844422e-03,  4.66431975e-02,  3.24045569e-02,
       -3.30052041e-02, -2.36238614e-02,  9.90164429e-02,  2.32491959e-02,
       -3.46563458e-02,  4.95550111e-02, -3.78869250e-02,  2.47202646e-02,
        6.42825942e-03,  3.54076326e-02,  1.01566939e-02, -7.96375517e-03,
        2.22159065e-02, -3.99433784e-02,  4.20013405e-02, -8.44628960e-02,
       -5.29295951e-02, -4.00129817e-02, -6.43038377e-02, -4.52808961e-02,
        1.71076190e-02,  2.26967279e-02, -5.17846737e-03,  1.42330918e-02,
        4.57747728e-02, -1.50454324e-03, -3.25138494e-02,  2.44415086e-02,
       -5.53570827e-03,  8.26840624e-02,  1.35807684e-02, -9.60453693e-03,
        8.85265768e-02, -

In [73]:
model.wv.most_similar_cosmul(positive=['[', ']'], negative=['('])

[(')', 0.7850743532180786),
 ('C', 0.7666725516319275),
 ('O', 0.7608593702316284),
 ('2', 0.7588022351264954),
 ('Br', 0.7586167454719543),
 ('c', 0.7585274577140808),
 ('o', 0.7574266195297241),
 ('@', 0.7543317079544067),
 ('N', 0.7541368007659912),
 ('n', 0.7470812797546387)]

In [74]:
model.wv.similarity('1', '2')

0.7343347026411112

In [75]:
model.wv.similarity('c', 'o')

0.724086992315808

In [76]:
model.wv.similarity('C', 'O')

0.6215794940273272

In [77]:
model.wv.similarity('+', '-')

0.49879750699818154