In [1]:
import pandas as pd
import rdkit

from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger
from rdkit.Chem.inchi import MolToInchiKey
RDLogger.DisableLog('rdApp.*')

from tqdm import tqdm
from rdkit import Chem, RDLogger
from rdkit.Chem import MolStandardize
from rdkit.Chem.Descriptors import ExactMolWt,MolWt



# 提取五个数据库中的SMILES列表

In [2]:
class MolClean(object):
    """Turn SMILES strings into canonical SMILES, optionally standardising first.

    The three standardisation helpers are created once per instance and kept on
    the attributes ``normizer``, ``lfc`` and ``uc`` so repeated calls to
    :meth:`clean` can reuse them.
    """

    def __init__(self):
        # Built from rdMolStandardize (the same module standardize_smi uses)
        # instead of the legacy pure-Python MolStandardize.normalize / .fragment /
        # .charge sub-modules, which were deprecated and are absent from newer
        # RDKit releases.
        self.normizer = rdMolStandardize.Normalizer()
        self.lfc = rdMolStandardize.LargestFragmentChooser()
        self.uc = rdMolStandardize.Uncharger()

    def clean(self, smi, standardize=False):
        """Return the canonical isomeric SMILES for ``smi``.

        Args:
            smi: SMILES string to parse.
            standardize: When True, normalise the molecule, keep only its
                largest fragment and neutralise it before writing the SMILES.
                The default (False) only re-canonicalises the input, which is
                what this method did before the flag existed.

        Returns:
            The canonical SMILES string, or None when ``smi`` cannot be parsed.
        """
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return None
        if standardize:
            mol = self.normizer.normalize(mol)
            mol = self.lfc.choose(mol)
            mol = self.uc.uncharge(mol)
        # isomericSmiles=True keeps stereochemistry in the output string and
        # canonical=True asks for the canonical atom ordering.
        return Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)

def standardize_smi(cid, smiles, basicClean=True, clearCharge=True, clearFrag=True,
                    canonTautomer=False, isomeric=False):
    """Standardise one SMILES string with RDKit's rdMolStandardize helpers.

    Each flag switches one step on or off; the enabled steps always run in the
    order clean-up, fragment parent, uncharge, canonical tautomer.

    Args:
        cid: Identifier of the compound; only used in the failure message.
        smiles: SMILES string to standardise.
        basicClean: Apply ``rdMolStandardize.Cleanup``.
        clearCharge: Neutralise the molecule with an ``Uncharger``.
        clearFrag: Keep only the parent fragment (``FragmentParent``).
        canonTautomer: Replace the molecule by its canonical tautomer.
        isomeric: True keeps stereo information in the output SMILES, False
            drops it.

    Returns:
        The canonical SMILES string, or None when any step raised; in that case
        ``cid``, the exception and ``smiles`` are printed.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if basicClean:
            # Original author's note: strips hydrogens, handles metal atoms and
            # normalises the molecule.
            mol = rdMolStandardize.Cleanup(mol)
        if clearFrag:
            # Keep only the main fragment as the molecule.
            mol = rdMolStandardize.FragmentParent(mol)
        if clearCharge:
            # Try to neutralise the molecule.
            mol = rdMolStandardize.Uncharger().uncharge(mol)
        if canonTautomer:
            # Tautomer handling; per the original note this step is not perfect
            # in every case.
            mol = rdMolStandardize.TautomerEnumerator().Canonicalize(mol)
        return Chem.MolToSmiles(mol, isomericSmiles=isomeric, canonical=True)
    except Exception as err:
        print(cid, err, smiles)
        return None

def standardize_smi_V1(cid, smiles, isomeric=True, min_weight=50, max_weight=900,
                       min_atoms=4, verbose=False):
    """Filter one compound by size and return its InChIKey and canonical SMILES.

    Args:
        cid: Identifier of the compound; only used in diagnostic messages.
        smiles: SMILES string to parse.
        isomeric: True keeps stereo information in the output SMILES, False
            drops it.
        min_weight: Smallest accepted molecular weight (``MolWt``), inclusive.
        max_weight: Largest accepted molecular weight (``MolWt``), inclusive.
        min_atoms: Smallest accepted ``GetNumAtoms()`` count. The default of 4
            rejects molecules with three atoms or fewer.
        verbose: When True, print why a compound was rejected. The default
            keeps the function silent.

    Returns:
        ``(inchikey, canonical_smiles)`` for an accepted compound, otherwise
        ``(None, None)``. Rejection covers unparsable input, a weight or atom
        count outside the limits, and any exception raised while processing.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES come back as None; reject them explicitly
            # instead of relying on the next call to raise.
            if verbose:
                print("{} could not be parsed, smiles is {}".format(cid, smiles))
            return None, None
        mol_weight = MolWt(mol)
        if mol_weight > max_weight or mol_weight < min_weight:
            if verbose:
                print("{} is too large or too small!,smiles is {}\n".format(cid, smiles))
            return None, None
        if mol.GetNumAtoms() < min_atoms:
            if verbose:
                print("{} is too small!,smiles is {}\n".format(cid, smiles))
            return None, None
        stan_smiles = Chem.MolToSmiles(mol, isomericSmiles=isomeric, canonical=True)
        inchkey = MolToInchiKey(mol)
    except Exception as e:
        # Deliberately best-effort: one bad record (for example a non-string
        # value in the SMILES column) must not abort a multi-million-row run.
        if verbose:
            print(cid, e, smiles)
        return None, None
    return inchkey, stan_smiles

## DrugBank

In [3]:
"提取DrugBank数据"
# Accumulator for the accepted DrugBank records; the following cell fills it.
DrugBank_list = []
# Read the DrugBank export and keep only the identifier and structure columns.
DrugBank_df = pd.read_csv("./drugbank_smiles.csv").loc[:, ["drugbank_id", "smiles"]]
DrugBank_df.head()

Unnamed: 0,drugbank_id,smiles
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...


In [4]:
# Start from an empty list inside this cell so that running the cell again
# cannot append the same records a second time.
DrugBank_list = []
# Walk the two columns directly; building a Series per row with iterrows() is
# not needed just to read two values.
for cid, smiles in zip(DrugBank_df["drugbank_id"], DrugBank_df["smiles"]):
    inchkey, stan_smiles = standardize_smi_V1(cid, smiles)
    # standardize_smi_V1 returns (None, None) for rejected compounds.
    if stan_smiles:
        DrugBank_list.append([cid, "DrugBank", inchkey, stan_smiles])

In [5]:
# Number of DrugBank records accepted by standardize_smi_V1.
print(len(DrugBank_list))

19279


## COCONUT

In [6]:
"提取COCONUT数据"
COCONUT_list = []
with open("./COCONUT.txt", "r") as file:
    # Stream the file line by line rather than loading every line into a list
    # that would stay bound to a notebook-level variable afterwards.
    for line in file:
        record = line.strip()
        if not record:
            # A blank line carries no record; skip it instead of failing on the
            # two-value unpack below.
            continue
        # Each record is "<smiles> <identifier>", separated by one space.
        smiles, cid = record.split(" ")
        inchkey, stan_smiles = standardize_smi_V1(cid, smiles)
        # standardize_smi_V1 returns (None, None) for rejected compounds.
        if stan_smiles:
            COCONUT_list.append([cid, "COCONUT", inchkey, stan_smiles])

In [7]:
# Number of COCONUT records accepted by standardize_smi_V1.
print(len(COCONUT_list))

377252


## NPASS

In [8]:
"提取NPASS数据"
NPASS_list = []
with open("./NPASS.txt", "r") as file:
    # Stream the file line by line rather than loading every line into a list
    # that would stay bound to a notebook-level variable afterwards.
    for line in file:
        record = line.strip()
        if not record:
            # A blank line carries no record; skip it instead of failing on the
            # four-value unpack below.
            continue
        # Four tab-separated fields per record. The separator is spelled "\t"
        # so it cannot be turned into spaces by an editor. The InChI and
        # InChIKey stored in the file are not used: the key is recomputed from
        # the parsed molecule.
        cid, _file_inchi, _file_inchikey, smiles = record.split("\t")
        inchkey, stan_smiles = standardize_smi_V1(cid, smiles)
        # standardize_smi_V1 returns (None, None) for rejected compounds.
        if stan_smiles:
            NPASS_list.append([cid, "NPASS", inchkey, stan_smiles])

In [9]:
# Number of NPASS records accepted by standardize_smi_V1.
print(len(NPASS_list))

90432


In [10]:
# Spot-check the first ten NPASS records: [ID, database, InChIKey, SMILES].
NPASS_list[:10]

[['NPC81384',
  'NPASS',
  'SUHOQUVVVLNYQR-MRVPVSSYSA-N',
  'C[N+](C)(C)CCOP(=O)([O-])OC[C@H](O)CO'],
 ['NPC109991',
  'NPASS',
  'RVOHRYJJXCFNQS-YEJXKQKISA-N',
  'CC[C@H](C)[C@H](C(=O)O)n1cc2c3c(c(-c4ccccc4)ccc3c1=O)C(=O)C(O)=C2'],
 ['NPC4649',
  'NPASS',
  'MHLKYRDXNNGAMX-WLMAVBAVSA-N',
  'C/C=C/C(=O)O[C@H]1[C@@H](OC(C)=O)c2c(ccc3ccc(=O)oc23)OC1(C)C'],
 ['NPC90844',
  'NPASS',
  'NCVWJDISIZHFQS-AWEZNQCLSA-N',
  'COc1ccc2c3c(c4cc(OC)c(OC)cc4c2c1)C[C@@H]1CCCN1C3'],
 ['NPC18064',
  'NPASS',
  'HNJKFVJZKCXLRL-SILQXMKOSA-N',
  'CC1=CC[C@]2(C(=O)O)CC[C@]3(C)[C@H](CC[C@@H]4[C@@]5(C)CC[C@H](O)C(C)(C)[C@@H]5CC[C@]43C)[C@H]2[C@@H]1C'],
 ['NPC212965',
  'NPASS',
  'MYAKCCHNPIRCLY-PSMGVCQDSA-N',
  'C/C(=C\\CC/C(C)=C/CC[C@]1(C)C=Cc2c(O)cc(C)cc2O1)CO'],
 ['NPC81135',
  'NPASS',
  'AAULQOBOWPLXLU-JSGCOSHPSA-N',
  'CC(C)(O)[C@H]1OC2=C(C(=O)c3ccccc3C2=O)[C@@H]1O'],
 ['NPC483076',
  'NPASS',
  'JZXCMDTZJYOYJA-DVXSWTTMSA-N',
  'CC(C)=CC[C@H]1C[C@@]2(C)C(=O)[C@](C(=O)C(C)C)(C(=O)C3=C2O[C@@H](C(C)(C)O)C3

## FooDB

In [11]:
"提取FooDB数据"
# pandas is already imported in the notebook's first cell, so it is not
# imported again here.
FooDB_list = []
# low_memory=False parses the file in a single pass. NOTE(review): the default
# chunked parse printed a warning for this file, presumably a mixed-dtype
# warning from a column that is not used here - confirm.
# NOTE(review): the structures are taken from the "cas_number" column, which
# holds SMILES strings in this export - confirm the CSV header alignment.
FooDB_df = pd.read_csv("./FooDB.csv", low_memory=False)[["public_id", "cas_number"]]
FooDB_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,public_id,cas_number
0,FDB000004,[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
1,FDB000013,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...
2,FDB000014,[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...
3,FDB000024,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...
4,FDB000025,[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...


In [12]:
# Start from an empty list inside this cell so that running the cell again
# cannot append the same records a second time.
FooDB_list = []
# Walk the two columns directly; building a Series per row with iterrows() is
# not needed just to read two values. The SMILES strings are read from the
# column named "cas_number".
for cid, smiles in zip(FooDB_df["public_id"], FooDB_df["cas_number"]):
    inchkey, stan_smiles = standardize_smi_V1(cid, smiles)
    # standardize_smi_V1 returns (None, None) for rejected compounds.
    if stan_smiles:
        FooDB_list.append([cid, "FooDB", inchkey, stan_smiles])

In [13]:
# Number of FooDB records accepted by standardize_smi_V1.
print(len(FooDB_list))

48008


In [14]:
# Spot-check the last ten FooDB records: [ID, database, InChIKey, SMILES].
FooDB_list[-10:]

[['FDB112145',
  'FooDB',
  'FCWNOQOVMGNKNG-WDSKDSINSA-N',
  'N[C@@H](CCC(=O)NC(=O)CC[C@H](N)C(=O)O)C(=O)O'],
 ['FDB112146',
  'FooDB',
  'VKENMLSUUVOQLD-VQVTYTSYSA-N',
  'N[C@@H](CCC(=O)N1C[C@H](O)C[C@H]1C(=O)O)C(=O)O'],
 ['FDB112147',
  'FooDB',
  'PXVCMZCJAUJLJP-YUMQZZPRSA-N',
  'N[C@@H](CCC(=O)N[C@@H](Cc1c[nH]cn1)C(=O)O)C(=O)O'],
 ['FDB112148',
  'FooDB',
  'LNLLNTMHVMIMOG-YUMQZZPRSA-N',
  'NCCCC[C@H](NC(=O)CC[C@H](N)C(=O)O)C(=O)O'],
 ['FDB112149',
  'FooDB',
  'VBCZKAGVUKCANO-BQBZGAKWSA-N',
  'N[C@@H](CCC(=O)N1CCC[C@H]1C(=O)O)C(=O)O'],
 ['FDB112150',
  'FooDB',
  'SQBNIUOYNOKDTI-WHFBIAKZSA-N',
  'N[C@@H](CCC(=O)N[C@@H](CO)C(=O)O)C(=O)O'],
 ['FDB112151',
  'FooDB',
  'GWNXFCYUJXASDX-ZDLURKLDSA-N',
  'C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(=O)O)C(=O)O'],
 ['FDB112152',
  'FooDB',
  'CATMPQFFVNKDEY-AAEUAGOBSA-N',
  'N[C@@H](CCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O)C(=O)O'],
 ['FDB112153',
  'FooDB',
  'HUSHOYQNDOSYMV-USYZEHPZSA-N',
  'CCCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCC)COC(=O)CC

## ChEMBL

In [15]:
ChEMBL_list = []
with open("chembl_33_chemreps.txt", "r") as file:
    # Skip the header row, then stream the remaining lines. This avoids holding
    # the whole file in a list (plus a sliced copy of that list) in memory.
    next(file, None)
    for line in file:
        record = line.strip()
        if not record:
            # A blank line carries no record; skip it instead of failing on the
            # four-value unpack below.
            continue
        # Four tab-separated fields per record; the last two are not used.
        # They get explicit names rather than "_", because assigning to "_" in
        # a notebook hides IPython's last-output variable.
        chembl_id, smiles, _unused_third, _unused_fourth = record.split("\t")
        inchkey, canonical_smiles = standardize_smi_V1(chembl_id, smiles)
        # standardize_smi_V1 returns (None, None) for rejected compounds.
        if canonical_smiles:
            ChEMBL_list.append([chembl_id, "ChEMBL", inchkey, canonical_smiles])

In [16]:
# Number of ChEMBL records accepted by standardize_smi_V1.
print(len(ChEMBL_list))

2310890


In [17]:
# Spot-check the last ten ChEMBL records: [ID, database, InChIKey, SMILES].
ChEMBL_list[-10:]

[['CHEMBL4296955',
  'ChEMBL',
  'YZZVVLUNOZJXCM-DADBAOPHSA-N',
  'CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C@H]3C(=O)C[C@]12C'],
 ['CHEMBL4296956',
  'ChEMBL',
  'JJYIOZSRKCDLFJ-CLFYSBASSA-N',
  'CC(C)=CCC/C(C)=C\\C(N)=O'],
 ['CHEMBL4296957',
  'ChEMBL',
  'NSIFGZRFYPOALF-UHFFFAOYSA-N',
  'CCOC(=O)C(C#N)C(=N)C1C(=O)OC2CCCCC21'],
 ['CHEMBL4298599',
  'ChEMBL',
  'ZVPPSACGDVPYQM-UHFFFAOYSA-N',
  'Br.Br.OCCCC1=CN(Cc2cccc(CN3C=C(CCCO)NC3)n2)CN1'],
 ['CHEMBL4298667',
  'ChEMBL',
  'VBZPYKANSUFIJK-UHFFFAOYSA-N',
  'F[PH](F)(F)(F)(F)F.N=c1ccc2c(-c3ccccc3)c3ccc(N)cc3sc-2c1'],
 ['CHEMBL4298695',
  'ChEMBL',
  'YCZQOKOQNQLIFH-UHFFFAOYSA-N',
  'CCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCC.O=S(=O)(NS(=O)(=O)C(F)(F)F)C(F)(F)F'],
 ['CHEMBL4298696',
  'ChEMBL',
  'VHZWZFZSRDDRHX-UHFFFAOYSA-N',
  'CCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCC.F[PH](F)(F)(F)(F)F'],
 ['CHEMBL4298698',
  'ChEMBL',
  'OTIKKVINVWNBOQ-LDJOHHLFSA-N',
  'C[n+]1cn([C@@H]2O[C@H](CO[P@@](=O)(S)OP(=O)([O-])OP(=O)(O)O)[C@@H](O)[C@H]2O)c2nc(N)n

# 去重与合并

In [18]:
# pandas is already imported in the notebook's first cell, so it is not
# imported again here.
# Tabulate the DrugBank records and keep the first row for each distinct
# canonical SMILES.
DrugBank_pd = (
    pd.DataFrame(DrugBank_list, columns=["ID", "Database", "Inchkey", "SMILES"])
    .drop_duplicates(subset=["SMILES"], keep="first")
)
DrugBank_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11043 entries, 0 to 19278
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        11043 non-null  object
 1   Database  11043 non-null  object
 2   Inchkey   11043 non-null  object
 3   SMILES    11043 non-null  object
dtypes: object(4)
memory usage: 431.4+ KB


In [19]:
# Tabulate the COCONUT records and keep the first row for each distinct
# canonical SMILES.
COCONUT_pd = (
    pd.DataFrame(COCONUT_list, columns=["ID", "Database", "Inchkey", "SMILES"])
    .drop_duplicates(subset=["SMILES"], keep="first")
)
COCONUT_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377252 entries, 0 to 377251
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ID        377252 non-null  object
 1   Database  377252 non-null  object
 2   Inchkey   377252 non-null  object
 3   SMILES    377252 non-null  object
dtypes: object(4)
memory usage: 14.4+ MB


In [20]:
# Tabulate the NPASS records and keep the first row for each distinct
# canonical SMILES.
NPASS_pd = (
    pd.DataFrame(NPASS_list, columns=["ID", "Database", "Inchkey", "SMILES"])
    .drop_duplicates(subset=["SMILES"], keep="first")
)
NPASS_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89649 entries, 0 to 90429
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        89649 non-null  object
 1   Database  89649 non-null  object
 2   Inchkey   89649 non-null  object
 3   SMILES    89649 non-null  object
dtypes: object(4)
memory usage: 3.4+ MB


In [21]:
# Tabulate the FooDB records and keep the first row for each distinct
# canonical SMILES.
FooDB_pd = (
    pd.DataFrame(FooDB_list, columns=["ID", "Database", "Inchkey", "SMILES"])
    .drop_duplicates(subset=["SMILES"], keep="first")
)
FooDB_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47222 entries, 0 to 48007
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        47222 non-null  object
 1   Database  47222 non-null  object
 2   Inchkey   47222 non-null  object
 3   SMILES    47222 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [22]:
# Tabulate the ChEMBL records and keep the first row for each distinct
# canonical SMILES.
ChEMBL_pd = (
    pd.DataFrame(ChEMBL_list, columns=["ID", "Database", "Inchkey", "SMILES"])
    .drop_duplicates(subset=["SMILES"], keep="first")
)
ChEMBL_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2310747 entries, 0 to 2310889
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ID        object
 1   Database  object
 2   Inchkey   object
 3   SMILES    object
dtypes: object(4)
memory usage: 88.1+ MB


In [23]:
# Stack the five per-database tables into one frame with a fresh 0..n-1 index.
# The order of the list matters: the SMILES de-duplication that follows keeps
# the first occurrence, so earlier databases win ties.
all_pd = pd.concat(
    [DrugBank_pd, COCONUT_pd, NPASS_pd, FooDB_pd, ChEMBL_pd],
    axis=0,
    ignore_index=True,
)
all_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2835913 entries, 0 to 2835912
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ID        object
 1   Database  object
 2   Inchkey   object
 3   SMILES    object
dtypes: object(4)
memory usage: 86.5+ MB


In [24]:
# Cross-database de-duplication: flag every repeat of a SMILES string after its
# first occurrence and keep only the unflagged rows (row labels are preserved).
all_pd = all_pd[~all_pd.duplicated(subset=["SMILES"], keep="first")]
all_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2743637 entries, 0 to 2835912
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ID        object
 1   Database  object
 2   Inchkey   object
 3   SMILES    object
dtypes: object(4)
memory usage: 104.7+ MB


In [25]:
# Renumber the rows 0..n-1: the de-duplication step keeps the original row
# labels, which leaves gaps in the index. drop=True discards the old labels
# instead of adding them as a column.
all_pd = all_pd.reset_index(drop=True)
all_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2743637 entries, 0 to 2743636
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ID        object
 1   Database  object
 2   Inchkey   object
 3   SMILES    object
dtypes: object(4)
memory usage: 83.7+ MB


In [26]:
# Display the merged table (the rendered output abbreviates the middle rows).
all_pd

Unnamed: 0,ID,Database,Inchkey,SMILES
0,DB00114,DrugBank,NGVDGCNFYWLIFO-UHFFFAOYSA-N,Cc1ncc(COP(=O)(O)O)c(C=O)c1O
1,DB00116,DrugBank,MSTNYGQPCMXVAQ-KIYNQFGBSA-N,Nc1nc(=O)c2c([nH]1)NCC(CNc1ccc(C(=O)N[C@@H](CC...
2,DB00117,DrugBank,HNDVDQJCIGZPNO-YFKPBYRVSA-N,N[C@@H](Cc1c[nH]cn1)C(=O)O
3,DB00118,DrugBank,MEFKEPWMEQBLKI-AIRLBKTGSA-N,C[S+](CC[C@H](N)C(=O)[O-])C[C@H]1O[C@@H](n2cnc...
4,DB00119,DrugBank,LCTONWCANYUPML-UHFFFAOYSA-N,CC(=O)C(=O)O
...,...,...,...,...
2743632,CHEMBL4298695,ChEMBL,YCZQOKOQNQLIFH-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCC.O=S(=O)(NS(=...
2743633,CHEMBL4298696,ChEMBL,VHZWZFZSRDDRHX-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCPCCCCCCCCCCCCCC.F[PH](F)(F)(...
2743634,CHEMBL4298698,ChEMBL,OTIKKVINVWNBOQ-LDJOHHLFSA-N,C[n+]1cn([C@@H]2O[C@H](CO[P@@](=O)(S)OP(=O)([O...
2743635,CHEMBL4298702,ChEMBL,NZIGZXNUFVMHNV-UHFFFAOYSA-N,c1ccc(C2CC(C3CC(c4ccccc4)OC(c4ccccc4)C3)CC(c3c...


In [27]:
# Disabled legacy de-duplication, kept as an inert string literal: nothing in it
# runs, and the notebook only echoes the text back as this cell's output.
# The snippet keys its dict on element [2] of each record, which is the InChIKey
# in the lists built earlier (not the SMILES that the name smiles_dict
# suggests), and it does not include ChEMBL_list.
"""
print("去重前:{}".format(len(FooDB_list)+len(NPASS_list)+len(COCONUT_list)+len(DrugBank_list)))
smiles_dict = {}
for i in FooDB_list:
    smiles_dict[i[2]] = i[0]
for i in NPASS_list:
    smiles_dict[i[2]] = i[0]
for i in COCONUT_list:
    smiles_dict[i[2]] = i[0]
for i in DrugBank_list:
    smiles_dict[i[2]] = i[0]
print("去重后:{}".format(len(smiles_dict)))
"""

'\nprint("去重前:{}".format(len(FooDB_list)+len(NPASS_list)+len(COCONUT_list)+len(DrugBank_list)))\nsmiles_dict = {}\nfor i in FooDB_list:\n    smiles_dict[i[2]] = i[0]\nfor i in NPASS_list:\n    smiles_dict[i[2]] = i[0]\nfor i in COCONUT_list:\n    smiles_dict[i[2]] = i[0]\nfor i in DrugBank_list:\n    smiles_dict[i[2]] = i[0]\nprint("去重后:{}".format(len(smiles_dict)))\n'

# 保存

In [28]:
# Write the merged, de-duplicated table to disk. No index= argument is passed,
# so pandas' default applies and the row index is written as the first column.
all_pd.to_csv("candidate_compounds.csv")

In [29]:
# Disabled legacy export, kept as an inert string literal: nothing in it runs,
# and the notebook only echoes the text back as this cell's output. The active
# export is the to_csv call in the previous cell. The snippet reads
# smiles_dict, which is only assigned inside the other disabled snippet.
"""
with open("candidate compounds.txt",'w') as file:
    for smiles, cid in smiles_dict.items():
        file.writelines("{} {}\n".format(cid, smiles))
"""

'\nwith open("candidate compounds.txt",\'w\') as file:\n    for smiles, cid in smiles_dict.items():\n        file.writelines("{} {}\n".format(cid, smiles))\n'