In [1]:
import pandas as pd

# ! pip install tqdm
# ! pip install molvs
from tqdm import tqdm
from molvs import standardize_smiles

from rdkit import Chem

In [13]:
def MolVStandardiseSmiles(smile_list):
    
    standardised_smiles = []

    for smile in tqdm(smile_list, total=len(smile_list)):
        try:
            # Check if SMILES is valid
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                raise ValueError(f"Invalid SMILES string: {smile}")

            # Standardize SMILES
            res1 = standardize_smiles(smile)
            res2 = pd.DataFrame(data=[{'original_smiles': smile, 'standardised_smiles': res1}])
            standardised_smiles.append(res2)
        
        except Exception as e:
            print(f"Error processing SMILES string {smile}: {e}")

    if standardised_smiles:
        standardised_smiles_df = pd.concat(standardised_smiles, ignore_index=True)
        return standardised_smiles_df
    else:
        return pd.DataFrame(columns=['original_smiles', 'standardised_smiles'])

In [14]:
chembl_mito_safe = pd.read_csv('../AL00_datasets/chembl_mito_safe_hepato_cardio_tox_mitotox_lit_alerts_removed.csv')

chembl_mito_safe = chembl_mito_safe[chembl_mito_safe['canonical_smiles'].notna()]
chembl_mito_safe['canonical_smiles'] = chembl_mito_safe['canonical_smiles'].astype(str)
chembl_mito_safe

Unnamed: 0,action_type,direct_interaction,disease_efficacy,max_phase,mechanism_of_action,molecule_chembl_id,parent_molecule_chembl_id,target_chembl_id,molecule_structures,pref_name,canonical_smiles
0,ACTIVATOR,1,1,4,Soluble guanylate cyclase activator,CHEMBL6622,CHEMBL6622,CHEMBL2111348,{'canonical_smiles': 'O=[N+]([O-])O[C@H]1CO[C@...,ISOSORBIDE DINITRATE,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...
1,INHIBITOR,1,1,4,Thiazide-sensitive sodium-chloride cotransport...,CHEMBL406,CHEMBL406,CHEMBL1876,{'canonical_smiles': 'CC1Cc2ccccc2N1NC(=O)c1cc...,INDAPAMIDE,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1
2,INHIBITOR,1,1,4,DNA inhibitor,CHEMBL416,CHEMBL416,CHEMBL2311221,{'canonical_smiles': 'COc1c2occc2cc2ccc(=O)oc1...,METHOXSALEN,COc1c2occc2cc2ccc(=O)oc12
3,INHIBITOR,1,1,4,Carbonic anhydrase I inhibitor,CHEMBL17,CHEMBL17,CHEMBL261,{'canonical_smiles': 'NS(=O)(=O)c1cc(Cl)c(Cl)c...,DICHLORPHENAMIDE,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,INHIBITOR,1,1,4,Carbonic anhydrase inhibitor,CHEMBL18,CHEMBL18,CHEMBL2095180,{'canonical_smiles': 'CCOc1ccc2nc(S(N)(=O)=O)s...,ETHOXZOLAMIDE,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
...,...,...,...,...,...,...,...,...,...,...,...
1042,INHIBITOR,1,1,4,Tyrosine-protein kinase JAK3 inhibitor,CHEMBL5314649,CHEMBL4085457,CHEMBL2148,{'canonical_smiles': 'C=CC(=O)N1C[C@H](Nc2ncnc...,RITLECITINIB TOSYLATE,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....
1043,INHIBITOR,1,1,4,Tyrosine-protein kinase TYK2 inhibitor,CHEMBL5315119,CHEMBL3622821,CHEMBL3553,{'canonical_smiles': 'CC[C@@H]1CN(C(=O)NCC(F)(...,UPADACITINIB HEMIHYDRATE,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...
1044,INHIBITOR,1,1,4,Ileal bile acid transporter inhibitor,CHEMBL5315120,CHEMBL4297588,CHEMBL2778,{'canonical_smiles': 'CCCCC1(CCCC)CN(c2ccccc2)...,ODEVIXIBAT SESQUIHYDRATE,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
1045,INHIBITOR,1,1,4,Envelope phospholipase OPG057 inhibitor,CHEMBL5315121,CHEMBL1257073,CHEMBL5308522,{'canonical_smiles': 'O.O=C(NN1C(=O)[C@@H]2[C@...,TECOVIRIMAT MONOHYDRATE,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...


In [15]:
chembl_mito_safe_smiles = chembl_mito_safe['canonical_smiles'].tolist()

chembl_mito_safe_smiles_s = MolVStandardiseSmiles(chembl_mito_safe_smiles)
chembl_mito_safe_smiles_s

100%|██████████████████████████████████████| 1003/1003 [00:01<00:00, 759.77it/s]


Unnamed: 0,original_smiles,standardised_smiles
0,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...
1,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1
2,COc1c2occc2cc2ccc(=O)oc12,COc1c2occc2cc2ccc(=O)oc12
3,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
...,...,...
998,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....
999,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...
1000,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
1001,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...


In [16]:
chembl_mito_safe_smiles_s = chembl_mito_safe_smiles_s.rename(columns={'original_smiles':'canonical_smiles'})

chembl_mito_safe_standardise = pd.merge(chembl_mito_safe, chembl_mito_safe_smiles_s, on='canonical_smiles', how='outer')

chembl_mito_safe_standardise.drop_duplicates(subset='molecule_chembl_id', keep="first", inplace=True)

chembl_mito_safe_standardise

Unnamed: 0,action_type,direct_interaction,disease_efficacy,max_phase,mechanism_of_action,molecule_chembl_id,parent_molecule_chembl_id,target_chembl_id,molecule_structures,pref_name,canonical_smiles,standardised_smiles
0,ACTIVATOR,1,1,4,Soluble guanylate cyclase activator,CHEMBL6622,CHEMBL6622,CHEMBL2111348,{'canonical_smiles': 'O=[N+]([O-])O[C@H]1CO[C@...,ISOSORBIDE DINITRATE,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...
1,INHIBITOR,1,1,4,Thiazide-sensitive sodium-chloride cotransport...,CHEMBL406,CHEMBL406,CHEMBL1876,{'canonical_smiles': 'CC1Cc2ccccc2N1NC(=O)c1cc...,INDAPAMIDE,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1
2,INHIBITOR,1,1,4,DNA inhibitor,CHEMBL416,CHEMBL416,CHEMBL2311221,{'canonical_smiles': 'COc1c2occc2cc2ccc(=O)oc1...,METHOXSALEN,COc1c2occc2cc2ccc(=O)oc12,COc1c2occc2cc2ccc(=O)oc12
3,INHIBITOR,1,1,4,Carbonic anhydrase I inhibitor,CHEMBL17,CHEMBL17,CHEMBL261,{'canonical_smiles': 'NS(=O)(=O)c1cc(Cl)c(Cl)c...,DICHLORPHENAMIDE,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,INHIBITOR,1,1,4,Carbonic anhydrase inhibitor,CHEMBL18,CHEMBL18,CHEMBL2095180,{'canonical_smiles': 'CCOc1ccc2nc(S(N)(=O)=O)s...,ETHOXZOLAMIDE,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
...,...,...,...,...,...,...,...,...,...,...,...,...
998,INHIBITOR,1,1,4,Tyrosine-protein kinase JAK3 inhibitor,CHEMBL5314649,CHEMBL4085457,CHEMBL2148,{'canonical_smiles': 'C=CC(=O)N1C[C@H](Nc2ncnc...,RITLECITINIB TOSYLATE,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....
999,INHIBITOR,1,1,4,Tyrosine-protein kinase TYK2 inhibitor,CHEMBL5315119,CHEMBL3622821,CHEMBL3553,{'canonical_smiles': 'CC[C@@H]1CN(C(=O)NCC(F)(...,UPADACITINIB HEMIHYDRATE,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...
1000,INHIBITOR,1,1,4,Ileal bile acid transporter inhibitor,CHEMBL5315120,CHEMBL4297588,CHEMBL2778,{'canonical_smiles': 'CCCCC1(CCCC)CN(c2ccccc2)...,ODEVIXIBAT SESQUIHYDRATE,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
1001,INHIBITOR,1,1,4,Envelope phospholipase OPG057 inhibitor,CHEMBL5315121,CHEMBL1257073,CHEMBL5308522,{'canonical_smiles': 'O.O=C(NN1C(=O)[C@@H]2[C@...,TECOVIRIMAT MONOHYDRATE,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...


In [17]:
#chembl_mito_safe_standardise.to_csv('../AL00_datasets/chembl_mito_safe_all_alerts_removed_smiles_standardised.csv', index=False)

In [18]:
tox21_mmp = pd.read_csv('../AL00_datasets/AID_720637_datatable_all.csv')

mmp_actives = tox21_mmp[tox21_mmp['PUBCHEM_ACTIVITY_OUTCOME']=='Active']

mmp_actives.reset_index(drop=True, inplace=True)

mmp_actives = mmp_actives[mmp_actives['PUBCHEM_EXT_DATASOURCE_SMILES'].notna()]
mmp_actives['PUBCHEM_EXT_DATASOURCE_SMILES'] = mmp_actives['PUBCHEM_EXT_DATASOURCE_SMILES'].astype(str)

mmp_actives

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,...,535 nm Activity,535 nm Potency (uM),535 nm Efficacy (%),590 nm Activity,590 nm Potency (uM),590 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
0,3,144203554.0,9403.0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...,Active,49.0,,,active antagonist,active antagonist,...,inactive,,0,inconclusive antagonist,5.30804,-49.1416,inconclusive,,,NCI
1,7,144203558.0,16043.0,CC(C)(C)C1=C(C=CC(=C1)O)O,Active,47.0,,,active antagonist,active antagonist,...,active agonist,27.0015,33.8497,active antagonist,27.0015,-75.5062,inactive,,0,SigmaAldrich
2,8,144203559.0,2724411.0,CN(C)C1=CC=C(C=C1)C(=C2C=CC(=[N+](C)C)C=C2)C3=...,Active,47.0,,,active antagonist,active antagonist,...,inactive,,0,active antagonist,26.6032,-107.612,inactive,,0,NCI
3,13,144203564.0,79472.0,CC1=CC(=C2C=C(C=CC2=N1)NC(=O)NC3=CC4=C(C=C(N=C...,Active,48.0,,,active antagonist,active antagonist,...,active agonist,28.0578,96.2381,active antagonist,22.095,-51.6939,inactive,,0,Labotest
4,23,144203574.0,65463.0,CC(C)(C)C1=CC=C(C=C1)CN2CCN(CC2)C(C3=CC=CC=C3)...,Active,43.0,,,active antagonist,active antagonist,...,inactive,,0,inconclusive antagonist,29.8493,-62.8369,inactive,,0,NCI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,10466,144214029.0,10800.0,CC1=CC(=CC(=C1O)[N+](=O)[O-])[N+](=O)[O-],Active,63.0,,,active antagonist,active antagonist,...,inactive,,0,active antagonist,2.35387,-97.2574,active antagonist,7.35735,-43.8623,LightBiologicals
1256,10473,144214036.0,4115.0,COC1=CC=C(C=C1)C(C2=CC=C(C=C2)OC)C(Cl)(Cl)Cl,Active,48.0,,,active antagonist,active antagonist,...,inactive,,0,active antagonist,30.7293,-94.3934,inactive,,0,LightBiologicals
1257,10476,144214039.0,29393.0,CCCN(CCC)C1=C(C=C(C=C1[N+](=O)[O-])S(=O)(=O)N)...,Active,52.0,,,active antagonist,active antagonist,...,active antagonist,47.2039,-56.1015,active antagonist,13.2469,-107.045,inconclusive antagonist,46.2109,-74.4164,LightBiologicals
1258,10477,144214040.0,5564.0,C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl,Active,66.0,,,active antagonist,active antagonist,...,inconclusive agonist,58.3873,120.771,active antagonist,1.38455,-98.2307,inconclusive antagonist,28.1806,-69.7736,SIGMA


In [19]:
mmp_actives_smiles = mmp_actives['PUBCHEM_EXT_DATASOURCE_SMILES'].tolist()

mmp_actives_smiles_s = MolVStandardiseSmiles(mmp_actives_smiles)
mmp_actives_smiles_s

100%|██████████████████████████████████████| 1229/1229 [00:01<00:00, 975.91it/s]


Unnamed: 0,original_smiles,standardised_smiles
0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...
1,CC(C)(C)C1=C(C=CC(=C1)O)O,CC(C)(C)c1cc(O)ccc1O
2,CN(C)C1=CC=C(C=C1)C(=C2C=CC(=[N+](C)C)C=C2)C3=...,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...
3,CC1=CC(=C2C=C(C=CC2=N1)NC(=O)NC3=CC4=C(C=C(N=C...,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...
4,CC(C)(C)C1=CC=C(C=C1)CN2CCN(CC2)C(C3=CC=CC=C3)...,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...
...,...,...
1224,CC1=CC(=CC(=C1O)[N+](=O)[O-])[N+](=O)[O-],Cc1cc([N+](=O)[O-])cc([N+](=O)[O-])c1O
1225,COC1=CC=C(C=C1)C(C2=CC=C(C=C2)OC)C(Cl)(Cl)Cl,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1
1226,CCCN(CCC)C1=C(C=C(C=C1[N+](=O)[O-])S(=O)(=O)N)...,CCCN(CCC)c1c([N+](=O)[O-])cc(S(N)(=O)=O)cc1[N+...
1227,C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl,Oc1cc(Cl)ccc1Oc1ccc(Cl)cc1Cl


In [20]:
mmp_actives_smiles_s = mmp_actives_smiles_s.rename(columns={'original_smiles':'PUBCHEM_EXT_DATASOURCE_SMILES'})

mmp_actives_standardise = pd.merge(mmp_actives, mmp_actives_smiles_s, on='PUBCHEM_EXT_DATASOURCE_SMILES', how='outer')

mmp_actives_standardise.drop_duplicates(subset='PUBCHEM_CID', keep="first", inplace=True)

mmp_actives_standardise

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,...,535 nm Potency (uM),535 nm Efficacy (%),590 nm Activity,590 nm Potency (uM),590 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source,standardised_smiles
0,3,144203554.0,9403.0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...,Active,49.0,,,active antagonist,active antagonist,...,,0,inconclusive antagonist,5.30804,-49.1416,inconclusive,,,NCI,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...
1,7,144203558.0,16043.0,CC(C)(C)C1=C(C=CC(=C1)O)O,Active,47.0,,,active antagonist,active antagonist,...,27.0015,33.8497,active antagonist,27.0015,-75.5062,inactive,,0,SigmaAldrich,CC(C)(C)c1cc(O)ccc1O
2,8,144203559.0,2724411.0,CN(C)C1=CC=C(C=C1)C(=C2C=CC(=[N+](C)C)C=C2)C3=...,Active,47.0,,,active antagonist,active antagonist,...,,0,active antagonist,26.6032,-107.612,inactive,,0,NCI,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...
6,13,144203564.0,79472.0,CC1=CC(=C2C=C(C=CC2=N1)NC(=O)NC3=CC4=C(C=C(N=C...,Active,48.0,,,active antagonist,active antagonist,...,28.0578,96.2381,active antagonist,22.095,-51.6939,inactive,,0,Labotest,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...
7,23,144203574.0,65463.0,CC(C)(C)C1=CC=C(C=C1)CN2CCN(CC2)C(C3=CC=CC=C3)...,Active,43.0,,,active antagonist,active antagonist,...,,0,inconclusive antagonist,29.8493,-62.8369,inactive,,0,NCI,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,10448,144214011.0,175967.0,CS(=O)(=O)C1=CC(=C(C=C1)C(=O)C2C(=O)CCCC2=O)[N...,Active,47.0,,,active antagonist,active antagonist,...,,0,active antagonist,15.5268,-78.5351,inactive,,0,LightBiologicals,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...
1725,10453,144214016.0,3712.0,C1=CC=C2C(=C1)C(=CN2)CO,Active,44.0,,,active antagonist,active antagonist,...,,29.4129,active antagonist,47.1413,-95.9907,inactive,,0,SIGMA,OCc1c[nH]c2ccccc12
1726,10461,144214024.0,31423.0,C1=CC2=C3C(=C1)C=CC4=CC=CC(=C43)C=C2,Active,43.0,,,active antagonist,active antagonist,...,56.3713,55.5849,active antagonist,54.0884,-75.0756,inactive,,0,LightBiologicals,c1cc2ccc3cccc4ccc(c1)c2c34
1727,10473,144214036.0,4115.0,COC1=CC=C(C=C1)C(C2=CC=C(C=C2)OC)C(Cl)(Cl)Cl,Active,48.0,,,active antagonist,active antagonist,...,,0,active antagonist,30.7293,-94.3934,inactive,,0,LightBiologicals,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1


In [21]:
#mmp_actives_standardise.to_csv('../AL00_datasets/tox21_mmp_actives_smiles_standardised.csv', index=False)