In [15]:
from rdkit import Chem
import numpy as np

# Character vocabulary for the one-hot encoding. A character's position in this
# list is its column index in the encoded matrix, so the order must stay fixed.
# Index 0 is the blank ' ': smiles_decoder maps all-zero (unused) rows to it.
SMILES_CHARS = [
    ' ',
    '#', '%', '(', ')', '+', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '=', '@',
    'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
    'R', 'S', 'T', 'V', 'X', 'Z',
    '[', '\\', ']',
    'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
    't', 'u',
]

# Forward lookup (character -> column index) and its inverse (column index -> character).
smi2index = { ch: idx for idx, ch in enumerate( SMILES_CHARS ) }
index2smi = dict( enumerate( SMILES_CHARS ) )
def smiles_encoder( smiles, maxlen=120 ):
    """One-hot encode a SMILES string after canonicalizing it with RDKit.

    The input is parsed with ``Chem.MolFromSmiles`` and re-serialized with
    ``Chem.MolToSmiles``, so different spellings of the same molecule produce
    the same matrix. Row ``i`` of the result has a 1 in the column given by
    ``smi2index`` for the i-th character of the canonical string; rows past
    the end of the string are left all-zero.

    Parameters
    ----------
    smiles : str
        SMILES string to encode.
    maxlen : int, optional
        Number of rows in the returned matrix (default 120).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))`` filled with 0 and 1.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    IndexError
        If the canonical SMILES is longer than ``maxlen``.
    KeyError
        If the canonical SMILES contains a character missing from
        ``SMILES_CHARS``.
    """
    mol = Chem.MolFromSmiles( smiles )
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; handing that
        # to MolToSmiles would fail with an opaque binding-level error instead.
        raise ValueError( "could not parse SMILES: {!r}".format( smiles ) )
    canonical = Chem.MolToSmiles( mol )

    # Same exception type an out-of-range row write would raise, but raised
    # up front with the actual lengths in the message.
    if len( canonical ) > maxlen:
        raise IndexError(
            "canonical SMILES has {} characters, which exceeds maxlen={}".format(
                len( canonical ), maxlen ) )

    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( canonical ):
        if c not in smi2index:
            # Same exception type as the plain dict lookup, with context added.
            raise KeyError(
                "character {!r} at position {} is not in SMILES_CHARS".format( c, i ) )
        X[i, smi2index[c] ] = 1
    return X
 
def smiles_decoder( X ):
    """Map a one-hot matrix back to a string of vocabulary characters.

    The column with the largest value in each row is looked up in
    ``index2smi`` and the resulting characters are concatenated in row order.
    An all-zero row has its maximum at column 0, which is the blank ' ', so
    unused rows show up as spaces in the returned string (they are not
    stripped).

    Parameters
    ----------
    X : numpy.ndarray
        Matrix with one row per character position, as produced by
        ``smiles_encoder``.

    Returns
    -------
    str
        One character per row of ``X``.
    """
    columns = X.argmax( axis=-1 )
    return ''.join( index2smi[ col ] for col in columns )


In [20]:
# Round-trip check: encode a SMILES string, then decode the matrix back to text.
mat=smiles_encoder('CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N')
print( mat.shape )

# The decoded text is the canonicalized SMILES produced inside smiles_encoder
# (not the input spelling), followed by one blank per unused row up to maxlen.
dec=smiles_decoder(mat)
print(dec)

(120, 56)
CC1CCN(C(=O)CC#N)CC1N(C)c1ncnc2[nH]ccc12                                                                                


In [19]:
# Encode the canonical spelling of the same molecule; an empty argwhere result
# means the two encodings agree in every cell.
# NOTE(review): this relies on `mat` being defined by another cell, and the
# execution counts are out of order (In [19] here, In [20] for the cell that
# assigns `mat`) - confirm the result with Restart Kernel -> Run All.
mat2 = smiles_encoder('CC1CCN(C(=O)CC#N)CC1N(C)c1ncnc2[nH]ccc12')
np.argwhere(mat!=mat2)

array([], shape=(0, 2), dtype=int64)

In [21]:
# Display the column-index -> character table to inspect the vocabulary order.
index2smi

{0: ' ',
 1: '#',
 2: '%',
 3: '(',
 4: ')',
 5: '+',
 6: '-',
 7: '.',
 8: '/',
 9: '0',
 10: '1',
 11: '2',
 12: '3',
 13: '4',
 14: '5',
 15: '6',
 16: '7',
 17: '8',
 18: '9',
 19: '=',
 20: '@',
 21: 'A',
 22: 'B',
 23: 'C',
 24: 'F',
 25: 'H',
 26: 'I',
 27: 'K',
 28: 'L',
 29: 'M',
 30: 'N',
 31: 'O',
 32: 'P',
 33: 'R',
 34: 'S',
 35: 'T',
 36: 'V',
 37: 'X',
 38: 'Z',
 39: '[',
 40: '\\',
 41: ']',
 42: 'a',
 43: 'b',
 44: 'c',
 45: 'e',
 46: 'g',
 47: 'i',
 48: 'l',
 49: 'n',
 50: 'o',
 51: 'p',
 52: 'r',
 53: 's',
 54: 't',
 55: 'u'}