# SMILES Level Embedding Molecules

## Importing Libraries

In [34]:
%load_ext autoreload
%autoreload 2
import embeddingMethods.embedding_3d_v2 as e3v2

import SmilesPE
from SmilesPE.pretokenizer import atomwise_tokenizer
import codecs
from SmilesPE.tokenizer import *

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize

import pandas as pd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
#Defining Directories
Activedatabase_Dir = r'../active_database/Database_v4_2.xlsx'
SPE_ChEMBL_Dir = r'../ChEMBL_dataset/SPE_ChEMBL.txt'

In [3]:
Indole_dataset = e3v2.dataset(Activedatabase_Dir)

Experimental Values


89it [00:00, 6234.02it/s]


Litrature Indole (Benz) Values


142it [00:00, 3119.19it/s]


Litrature Quinilones


14it [00:00, 2394.50it/s]


In [20]:
def load_SPE_dict(dict_dir):
    RAW_index = codecs.open(dict_dir)
    spe = SPE_Tokenizer(RAW_index)
    
    return dict

def switch_keys_values(my_dict):
    return {value: key for key, value in my_dict.items()}

def padd_vecs(vec):
    max_len = max(len(seq) for seq in vec)  
    padded_vec = [seq + [0] * (max_len - len(seq)) for seq in vec] 

def molecule_SMILES_embedding(raw_smiles, dims, vocab_dictionary):

    #SMILE standardization and vocab generation
    standardized_smile = []
    vocab_tokens = []
    for x in raw_smiles:
        mol = Chem.MolFromSmiles(x)
        if mol == None:
            print(x)
        else:
            normalized_mol = rdMolStandardize.Normalize(mol)
            #Converts a SMILE into a list of tokens and merges into dataset
            tokenized_SMILE = spe.tokenize(Chem.MolToSmiles(normalized_mol)).split() #.split() may be nescarry on the tokenizer for output to be list
            for token in tokenized_SMILE:
                vocab_tokens.append(token)


    vocab_tokens = sorted(set(vocab_tokens)) #Sorted set of all tokens in dataset
    vocab_dict = switch_keys_values(dict(enumerate(vocab))) #Makes into dictionary with tokens

    #Generating smiles vector scalar list
    vector_scalar_list = []
    for list1 in token_strings:
        for i in range(len(list1)):
         key = list1[i]
         if key in index:
            list1[i] = index[key]
        vector_scalar_list.append(list1)
    vector_df = pd.DataFrame(vector_scalar_list)


In [12]:
test_smiles = [Chem.MolToSmiles(Chem.AddHs(molecule.rdkit_mol)) for molecule in Indole_dataset.compounds]

In [13]:
test_smiles[0:5]

['[H]/C(=N\\c1c([H])c([H])c([H])c([H])c1[H])c1c([H])n([H])c2c([H])c([H])c([H])c([H])c12',
 '[H]Oc1c([H])c([H])c(/N=C(\\[H])c2c([H])n([H])c3c([H])c([H])c([H])c([H])c23)c([H])c1[H]',
 '[H]/C(=N\\C([H])([H])C([H])([H])C([H])([H])[H])c1c([H])n([H])c2c([H])c([H])c([H])c([H])c12',
 '[H]/C(=N\\c1c([H])c([H])c(OC([H])([H])[H])c([H])c1[H])c1c([H])n([H])c2c([H])c([H])c([H])c([H])c12',
 '[H]/C(=N\\c1c([H])c([H])c(Cl)c([H])c1[H])c1c([H])n([H])c2c([H])c([H])c([H])c([H])c12']

In [5]:
RAW_index = codecs.open(SPE_ChEMBL_Dir)
spe = SPE_Tokenizer(RAW_index)
spe_vocab = codecs.open(SPE_ChEMBL_Dir).read().split("\n")

In [None]:
standardized_smile = []
vocab_tokens = []
for x in test_smiles:
    mol = Chem.MolFromSmiles(x)
    if mol == None:
        print(x)
    else:
        normalized_mol = rdMolStandardize.Normalize(mol)
        standardized_smile.append(Chem.MolToSmiles(normalized_mol))
        #Converts a SMILE into a list of tokens and merges into dataset
        tokenized_SMILE = spe.tokenize(Chem.MolToSmiles(normalized_mol)).split() #.split() may be nescarry on the tokenizer for output to be list
        for token in tokenized_SMILE:
            vocab_tokens.append(token)

In [23]:
vocab_tokens = sorted(set(vocab_tokens)) #Sorted set of all tokens in dataset
vocab_dict = switch_keys_values(dict(enumerate(vocab_tokens))) #Makes into dictionary with tokens

In [35]:
empty_df = []
for x in standardized_smile: 
    tokenizer = spe.tokenize(x).split()
    tokenization = pd.DataFrame({'Tokenized SMILES': [tokenizer]})  
    empty_df.append(tokenization)

In [37]:
empty_df[0]

Unnamed: 0,Tokenized SMILES
0,"[C(, =N/, c1ccccc1), \, c1c[nH], c2ccccc12]"


In [39]:
tokenized_df = pd.concat(empty_df, ignore_index=True)
output3 = pd.concat([raw_smiles_df, tokenized_df], axis=1, ignore_index=False, join='outer')

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [40]:
tokenized_df

Unnamed: 0,Tokenized SMILES
0,"[C(, =N/, c1ccccc1), \, c1c[nH], c2ccccc12]"
1,"[Oc1ccc(, /N=C/, c2c[nH], c3ccccc23)cc1]"
2,"[CCC, /N=C/, c1c[nH], c2ccccc12]"
3,"[COc1ccc(, /N=C/, c2c[nH], c3ccccc23)cc1]"
4,"[Clc1ccc(, /N=C/, c2c[nH], c3ccccc23)cc1]"
...,...
240,"[COc1c(, N2, CCN, C(C), C2), c(F)cc2, c(=O)c(C..."
241,"[COc1c(, N2, C[C@H]3, CCCN, [C@H]3, C2), c(F)c..."
242,"[CO, /N, =C1, /C, N(, c2nc3c(, cc2, F), c(=O)c..."
243,"[NC1, [C@H]2, CN(, c3nc4c(, cc3, F), c(=O)c(C(..."


In [46]:
token_strings = tokenized_df['Tokenized SMILES']
vocab = sorted(set(token for tokens in token_strings for token in tokens)) 
index3 = dict(enumerate(vocab)) #assigns every value to single scalar
def switch_keys_values(my_dict):
    return {value: key for key, value in my_dict.items()}

index = switch_keys_values(index3)

In [48]:
vector_scalar_list = []
for list1 in tokenized_df['Tokenized SMILES']:
    for i in range(len(list1)):
     key = list1[i]
     if key in index:
        list1[i] = index[key]
    vector_scalar_list.append(list1)
vector_df = pd.DataFrame(vector_scalar_list)

In [52]:
vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,53,44,228,179,202.0,259.0,,,,,,,,,,
1,165,23,246,293,,,,,,,,,,,,
2,82,23,202,259,,,,,,,,,,,,
3,99,23,246,293,,,,,,,,,,,,
4,123,23,246,293,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,98,138,87,59,70.0,189.0,183.0,341.0,74.0,196.0,,,,,,
241,98,138,107,85,171.0,70.0,189.0,183.0,341.0,74.0,196.0,,,,,
242,97,20,42,15,136.0,267.0,326.0,125.0,183.0,344.0,71.0,79.0,95.0,,,
243,146,170,96,300,328.0,125.0,183.0,340.0,34.0,9.0,287.0,125.0,105.0,27.0,,


In [47]:
index

{'(': 0,
 '(C': 1,
 '(N': 2,
 '(O)': 3,
 ')cc': 4,
 ')cc1)': 5,
 ')cc2': 6,
 ')cc3': 7,
 ')ccc2': 8,
 '-': 9,
 '-c1ccc(': 10,
 '-c1ccccc1': 11,
 '-c2': 12,
 '-c2ccc(': 13,
 '-n2': 14,
 '/C': 15,
 '/C(=N/': 16,
 '/C=C/': 17,
 '/C=N/N': 18,
 '/C=N\\': 19,
 '/N': 20,
 '/N=C(/': 21,
 '/N=C(\\C)': 22,
 '/N=C/': 23,
 '/N=C\\': 24,
 '/N=N/': 25,
 '1': 26,
 '2': 27,
 '2)': 28,
 '2)cc1': 29,
 '3': 30,
 '3)': 31,
 '3)c1': 32,
 '3)cc1': 33,
 '4': 34,
 '4)': 35,
 '4)cc': 36,
 '4)cc2': 37,
 '4)cc2)': 38,
 '5': 39,
 '5)': 40,
 '=C': 41,
 '=C1': 42,
 '=C\\': 43,
 '=N/': 44,
 '=N/N': 45,
 '=N\\': 46,
 'Br': 47,
 'Br)cc': 48,
 'Br)cc1': 49,
 'C': 50,
 'C#N': 51,
 'C#N)cc': 52,
 'C(': 53,
 'C(=N': 54,
 'C(=O)': 55,
 'C(=O)O)': 56,
 'C(=O)O)cc1)': 57,
 'C(=S)N': 58,
 'C(C)': 59,
 'C(C)=O)': 60,
 'C(F)': 61,
 'C)': 62,
 'C/C(=N\\': 63,
 'C1': 64,
 'C1=O': 65,
 'C1CC1': 66,
 'C1CCN(': 67,
 'C1CN(': 68,
 'C2': 69,
 'C2)': 70,
 'C2CC2)': 71,
 'C3)': 72,
 'C3=O': 73,
 'C3CC3)': 74,
 'C4CC4)': 75,
 'CC(=O)': 7

In [26]:
vocab_tokens

['(',
 '(C',
 '(N',
 '(O)',
 ')cc',
 ')cc1)',
 ')cc2',
 ')cc3',
 ')ccc2',
 '-',
 '-c1ccc(',
 '-c1ccccc1',
 '-c2',
 '-c2ccc(',
 '-n2',
 '/C',
 '/C(=N/',
 '/C=C/',
 '/C=N/N',
 '/C=N\\',
 '/N',
 '/N=C(/',
 '/N=C(\\C)',
 '/N=C/',
 '/N=C\\',
 '/N=N/',
 '1',
 '2',
 '2)',
 '2)cc1',
 '3',
 '3)',
 '3)c1',
 '3)cc1',
 '4',
 '4)',
 '4)cc',
 '4)cc2',
 '4)cc2)',
 '5',
 '5)',
 '=C',
 '=C1',
 '=C\\',
 '=N/',
 '=N/N',
 '=N\\',
 'Br',
 'Br)cc',
 'Br)cc1',
 'C',
 'C#N',
 'C#N)cc',
 'C(',
 'C(=N',
 'C(=O)',
 'C(=O)O)',
 'C(=O)O)cc1)',
 'C(=S)N',
 'C(C)',
 'C(C)=O)',
 'C(F)',
 'C)',
 'C/C(=N\\',
 'C1',
 'C1=O',
 'C1CC1',
 'C1CCN(',
 'C1CN(',
 'C2',
 'C2)',
 'C2CC2)',
 'C3)',
 'C3=O',
 'C3CC3)',
 'C4CC4)',
 'CC(=O)',
 'CC(=O)N',
 'CC(C(=O)O)',
 'CC1',
 'CC3)cc2',
 'CC5)',
 'CCC',
 'CCCC',
 'CCCC2',
 'CCCN',
 'CCCNC(=O)',
 'CCN',
 'CCN(C)CC3)',
 'CCN1',
 'CCNCC3)',
 'CCOC(=O)',
 'CCc1n',
 'CCn1',
 'CCn1cc(',
 'CN',
 'CN(',
 'CO',
 'COc1c(',
 'COc1ccc(',
 'COc1ccc(C(=O)N',
 'COc1ccc(N',
 'COc1ccc2[nH]',
 'COc

In [24]:
vocab_dict

{'(': 0,
 '(C': 1,
 '(N': 2,
 '(O)': 3,
 ')cc': 4,
 ')cc1)': 5,
 ')cc2': 6,
 ')cc3': 7,
 ')ccc2': 8,
 '-': 9,
 '-c1ccc(': 10,
 '-c1ccccc1': 11,
 '-c2': 12,
 '-c2ccc(': 13,
 '-n2': 14,
 '/C': 15,
 '/C(=N/': 16,
 '/C=C/': 17,
 '/C=N/N': 18,
 '/C=N\\': 19,
 '/N': 20,
 '/N=C(/': 21,
 '/N=C(\\C)': 22,
 '/N=C/': 23,
 '/N=C\\': 24,
 '/N=N/': 25,
 '1': 26,
 '2': 27,
 '2)': 28,
 '2)cc1': 29,
 '3': 30,
 '3)': 31,
 '3)c1': 32,
 '3)cc1': 33,
 '4': 34,
 '4)': 35,
 '4)cc': 36,
 '4)cc2': 37,
 '4)cc2)': 38,
 '5': 39,
 '5)': 40,
 '=C': 41,
 '=C1': 42,
 '=C\\': 43,
 '=N/': 44,
 '=N/N': 45,
 '=N\\': 46,
 'Br': 47,
 'Br)cc': 48,
 'Br)cc1': 49,
 'C': 50,
 'C#N': 51,
 'C#N)cc': 52,
 'C(': 53,
 'C(=N': 54,
 'C(=O)': 55,
 'C(=O)O)': 56,
 'C(=O)O)cc1)': 57,
 'C(=S)N': 58,
 'C(C)': 59,
 'C(C)=O)': 60,
 'C(F)': 61,
 'C)': 62,
 'C/C(=N\\': 63,
 'C1': 64,
 'C1=O': 65,
 'C1CC1': 66,
 'C1CCN(': 67,
 'C1CN(': 68,
 'C2': 69,
 'C2)': 70,
 'C2CC2)': 71,
 'C3)': 72,
 'C3=O': 73,
 'C3CC3)': 74,
 'C4CC4)': 75,
 'CC(=O)': 7