In [None]:
import pandas as pd
from IPython.display import Audio
# audio clip passed to Audio(..., autoplay=True) at the end of the long-running fetch cells
sound_file = '../diving.mp3'
import re
import numpy as np
import pubchempy as pcp
from SmilesPE.pretokenizer import atomwise_tokenizer

In [None]:
# SMILES tokens that make up the shared alphabet, grouped by kind for readability.
shared_alphabet = {
    # atom symbols
    'C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I',
    # charged bracket atoms
    '[N+]', '[N-]', '[O-]',
    # bond symbols and branch parentheses
    '=', '#', '(', ')',
    # digits
    '1', '2', '3', '4', '5', '6', '7', '8', '9',
}

# Tab-separated compound file of the Lenselink et al. dataset that the ChEMBL IDs are read from.
filepath_Lenselink_full_dataset = '../datasets/Lenselink_et_al/Dataset_files_342_MB/data/Supplementary_Information/dataset/compound_additional_physchem_features.txt'

## Getting the ChEMBL IDs from all Lenselink compounds

In [3]:
# Collect the ChEMBL ID that starts each matching line of the Lenselink compound file.
# A line contributes an ID only when it begins with "CHEMBL<digits>" followed by a tab
# and at least one more character; every other line is skipped.
chembl_id_line = re.compile('^(CHEMBL[0-9]+)\t.+$')

with open(filepath_Lenselink_full_dataset) as ds:
    # lazily match each line, then keep the captured ID of the lines that matched
    line_matches = (chembl_id_line.search(line) for line in ds)
    list_lense_Chemblids = [hit.group(1) for hit in line_matches if hit]

In [4]:
# Optional: write the list of ChEMBL IDs to a file (left disabled; uncomment the lines below to regenerate it)
#with open('../datasets/Lenselink_et_al/Lenselink_Cleaning_A_chemblids.txt', 'w') as Lense_chemblids_file:
#    for id in list_lense_Chemblids:
#        Lense_chemblids_file.write("%s\n" % id)

## Mapping the PubChem IDs to the ChEMBL IDs

In [5]:
# Load the UniChem src1 -> src22 mapping table, give its two columns readable names
# and store the PubChem IDs with the generic `object` dtype.
unichem = (
    pd.read_table('../datasets/UniChem_Mapping/src1src22.txt')
    .rename(columns={"From src:'1'": "chembl_id", "To src:'22'": "pubchem_id"})
    .astype({'pubchem_id': 'object'})
)

In [6]:
# Wrap the collected ChEMBL IDs in a single-column DataFrame so they can be merged.
df_lense = pd.DataFrame({'lense_chembl_id': list_lense_Chemblids})

In [7]:
# Attach the PubChem ID to every Lenselink ChEMBL ID (left join keeps unmapped compounds
# for the moment), keep one row per PubChem ID, then remove the rows without a mapping.
# The final reset_index() is assigned back: without it the index keeps gaps left by
# dropna(), and index labels no longer coincide with row positions.
lense = (
    pd.merge(df_lense, unichem,
             how='left',
             left_on='lense_chembl_id', right_on='chembl_id')
    .drop_duplicates(subset='pubchem_id', keep='first')
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

# show the cleaned mapping table
lense

Unnamed: 0,lense_chembl_id,chembl_id,pubchem_id
0,CHEMBL2364776,CHEMBL2364776,60162406
1,CHEMBL2365129,CHEMBL2365129,71561420
2,CHEMBL29641,CHEMBL29641,1714
3,CHEMBL1229592,CHEMBL1229592,44607530
4,CHEMBL360190,CHEMBL360190,10396070
...,...,...,...
197363,CHEMBL2172055,CHEMBL2172055,71460740
197364,CHEMBL2146864,CHEMBL2146864,71454646
197365,CHEMBL3304900,CHEMBL3304900,3048938
197366,CHEMBL3274623,CHEMBL3274623,3048936


## Mapping the PubChem SMILES to the PubChem IDs

In [8]:
# Look up the canonical SMILES of every compound on PubChem, one from_cid() call per row.
# The target column is created only when it does not exist yet, so re-running this cell
# after an interruption continues where it stopped instead of discarding earlier results.
if 'canonical_smiles' not in lense.columns:
    lense['canonical_smiles'] = ''

for row in lense.index:
    # already filled by an earlier (interrupted) run
    if lense.at[row, 'canonical_smiles']:
        continue
    # `row` is an index label, so cells are addressed by label and column name
    # rather than by position
    entry = pcp.Compound.from_cid(lense.at[row, 'pubchem_id'])
    lense.at[row, 'canonical_smiles'] = entry.canonical_smiles

# play the notification sound once the loop has finished
Audio(sound_file, autoplay=True)

KeyboardInterrupt: 

In [25]:
# Spot-check a window of rows selected by index label
# (presumably to see how far the interrupted SMILES download got — TODO confirm).
lense.loc[25100:25150]

Unnamed: 0,lense_chembl_id,chembl_id,pubchem_id,canonical_smiles
25100,CHEMBL2334631,CHEMBL2334631,25192356,C1=CC=C(C(=C1)OC2=CC=C(C=C2)C3=NC4=C(N3)C=C(C=...
25101,CHEMBL2334643,CHEMBL2334643,25192509,C1=CC=C(C(=C1)C(=O)C2=CC=C(C=C2)C3=NC4=C(N3)C=...
25102,CHEMBL2334646,CHEMBL2334646,25192510,C1=CC=C(C(=C1)C(=O)C2=CC=C(C=C2)C3=NC4=C(N3)C=...
25103,CHEMBL3127535,CHEMBL3127535,76329166,C1CN(CCC1N2CCC3=C2C=C(C=C3)F)C4=NN=C(C=C4)C5=C...
25104,CHEMBL3127536,CHEMBL3127536,76314705,C1CN(CCC1N2CCC3=C2C=C(C=C3)F)C4=NN=C(C=C4)C5=C...
25105,CHEMBL3127665,CHEMBL3127665,76336462,CN1C=C(C=N1)C2=NN=C(C=C2)N3CCC(CC3)N4CCC5=C4C=...
25106,CHEMBL3127654,CHEMBL3127654,76332842,CC1=CSC(=N1)C2=NN=C(C=C2)N3CCC(CC3)N4CCC5=C4C=...
25107,CHEMBL3093866,CHEMBL3093866,71657335,CC1=CSC(=N1)NC(=O)N2CCC(CC2)N3CCC4=CC=CC=C43
25108,CHEMBL3093868,CHEMBL3093868,71657232,CC1=CSC(=N1)NC(=O)N2CCC(CC2)N3CCC4=C3C=C(C=C4)F
25109,CHEMBL3093865,CHEMBL3093865,71657233,CC1=CSC(=N1)NC(=O)N2CCC(CC2)N3C=CC4=CC=CC=C43


In [None]:
# drop the chembl columns
# NOTE(review): the frame returned by drop() is not assigned, so this statement leaves
# `lense` unchanged. Assign it only after confirming that no other cell relies on
# positional column indices, which would shift once the two columns are gone.
lense.drop(columns=['lense_chembl_id', 'chembl_id'])

# drop nas and renumber the index; the result is assigned so that the index labels
# are contiguous again afterwards
lense = lense.dropna(how='any').reset_index(drop=True)

# show the cleaned frame
lense

## Filter the SMILES for subsets of shared alphabet

In [None]:
# Run the atom-wise tokenizer over every canonical SMILES string and store its
# result for each compound in the new `tokens` column.
lense['tokens'] = lense['canonical_smiles'].map(atomwise_tokenizer)

In [None]:
# Fetch the canonical SMILES from PubChem for the rows that do not have one yet.
# Cells are addressed by index label and column name: `row` comes from lense.index
# (labels), and column names stay valid even if the column order changes.
for row in lense.index:
    # keep SMILES that were already downloaded
    if lense.at[row, 'canonical_smiles']:
        continue
    entry = pcp.Compound.from_cid(lense.at[row, 'pubchem_id'])
    lense.at[row, 'canonical_smiles'] = entry.canonical_smiles

# NOTE(review): if this is a single cell, the Audio object is not its last expression
# and is therefore not displayed automatically — confirm whether the sound is wanted here.
Audio(sound_file, autoplay=True)


shared_alphabet