In [8]:
import pandas as pd
from tqdm import tqdm

import pubchempy as pcp
import urllib
from urllib.request import urlopen
from pubchempy import PubChemHTTPError
import time

import requests

In [9]:
# Generate chemical name from CID

def NameFromCID(cid):
    max_retries = 10
    retries = 0
    while retries < max_retries:
        try:
            compounds = pcp.get_compounds(cid, "cid")
            compound = compounds[0]
            
            names = compound.synonyms
            name = names[0]
            
            chemical_list = {"chemical name": [name], "PUBCHEM_CID": [cid]}
            chemical_df = pd.DataFrame.from_dict(chemical_list)
            return chemical_df
        
        except (IndexError, AttributeError, PubChemHTTPError, urllib.error.URLError):
            retries += 1
            time.sleep(5)
            return None

In [10]:
def InChIKeyFromSMILES(smiles):
  max_retries = 20
  retries = 0
  while retries < max_retries:
    try:
      properties = ["InChIKey"]
      compounds = pcp.get_properties(properties, smiles, "smiles")
      compounds[0].update({"standardised_smiles": smiles})  
      prop_df = pd.DataFrame(compounds)
      return(prop_df)
   
  
    except (IndexError, AttributeError, PubChemHTTPError, requests.exceptions.Timeout, requests.exceptions.ConnectionError, requests.exceptions.RequestException) as e:
      retries += 1
      print(f"Exception caught: {e}")
      if retries < max_retries:
        print(f"Retrying in 3 seconds...")
        time.sleep(3)
    
      return None

# Subset Tox21 MMP data into just standardised smiles, source and mitochondrial toxicity + Generate Names and InChIKeys

In [11]:
tox21_mmp = pd.read_csv("../AL00_datasets/tox21_mmp_actives_smiles_standardised.csv")
tox21_mmp

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Ratio Activity,...,535 nm Potency (uM),535 nm Efficacy (%),590 nm Activity,590 nm Potency (uM),590 nm Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source,standardised_smiles
0,3,144203554.0,9403.0,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...,Active,49.0,,,active antagonist,active antagonist,...,,0.0000,inconclusive antagonist,5.30804,-49.1416,inconclusive,,,NCI,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...
1,7,144203558.0,16043.0,CC(C)(C)C1=C(C=CC(=C1)O)O,Active,47.0,,,active antagonist,active antagonist,...,27.0015,33.8497,active antagonist,27.00150,-75.5062,inactive,,0.0000,SigmaAldrich,CC(C)(C)c1cc(O)ccc1O
2,8,144203559.0,2724411.0,CN(C)C1=CC=C(C=C1)C(=C2C=CC(=[N+](C)C)C=C2)C3=...,Active,47.0,,,active antagonist,active antagonist,...,,0.0000,active antagonist,26.60320,-107.6120,inactive,,0.0000,NCI,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...
3,13,144203564.0,79472.0,CC1=CC(=C2C=C(C=CC2=N1)NC(=O)NC3=CC4=C(C=C(N=C...,Active,48.0,,,active antagonist,active antagonist,...,28.0578,96.2381,active antagonist,22.09500,-51.6939,inactive,,0.0000,Labotest,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...
4,23,144203574.0,65463.0,CC(C)(C)C1=CC=C(C=C1)CN2CCN(CC2)C(C3=CC=CC=C3)...,Active,43.0,,,active antagonist,active antagonist,...,,0.0000,inconclusive antagonist,29.84930,-62.8369,inactive,,0.0000,NCI,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,10448,144214011.0,175967.0,CS(=O)(=O)C1=CC(=C(C=C1)C(=O)C2C(=O)CCCC2=O)[N...,Active,47.0,,,active antagonist,active antagonist,...,,0.0000,active antagonist,15.52680,-78.5351,inactive,,0.0000,LightBiologicals,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...
1005,10453,144214016.0,3712.0,C1=CC=C2C(=C1)C(=CN2)CO,Active,44.0,,,active antagonist,active antagonist,...,,29.4129,active antagonist,47.14130,-95.9907,inactive,,0.0000,SIGMA,OCc1c[nH]c2ccccc12
1006,10461,144214024.0,31423.0,C1=CC2=C3C(=C1)C=CC4=CC=CC(=C43)C=C2,Active,43.0,,,active antagonist,active antagonist,...,56.3713,55.5849,active antagonist,54.08840,-75.0756,inactive,,0.0000,LightBiologicals,c1cc2ccc3cccc4ccc(c1)c2c34
1007,10473,144214036.0,4115.0,COC1=CC=C(C=C1)C(C2=CC=C(C=C2)OC)C(Cl)(Cl)Cl,Active,48.0,,,active antagonist,active antagonist,...,,0.0000,active antagonist,30.72930,-94.3934,inactive,,0.0000,LightBiologicals,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1


In [12]:
mmp_toxic = tox21_mmp[['standardised_smiles', 'PUBCHEM_CID']]
mmp_toxic['source'] = "tox21_mmp"
mmp_toxic['mitochondrial toxic'] = "toxic"
mmp_toxic = mmp_toxic.rename(columns={'PUBCHEM_CID':'CID'})
mmp_toxic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mmp_toxic['source'] = "tox21_mmp"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mmp_toxic['mitochondrial toxic'] = "toxic"


Unnamed: 0,standardised_smiles,CID,source,mitochondrial toxic
0,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,9403.0,tox21_mmp,toxic
1,CC(C)(C)c1cc(O)ccc1O,16043.0,tox21_mmp,toxic
2,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,2724411.0,tox21_mmp,toxic
3,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,79472.0,tox21_mmp,toxic
4,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,65463.0,tox21_mmp,toxic
...,...,...,...,...
1004,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...,175967.0,tox21_mmp,toxic
1005,OCc1c[nH]c2ccccc12,3712.0,tox21_mmp,toxic
1006,c1cc2ccc3cccc4ccc(c1)c2c34,31423.0,tox21_mmp,toxic
1007,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1,4115.0,tox21_mmp,toxic


In [13]:
# Ensure 'CID' column values are integers

mmp_toxic["CID"] = pd.to_numeric(mmp_toxic["CID"], errors='coerce')
mmp_toxic.dropna(subset=["CID"], inplace=True)
mmp_toxic["CID"] = mmp_toxic["CID"].astype(int)

mmp_cids = mmp_toxic["CID"].tolist()

mmp_names = []

for cids in tqdm(mmp_cids, total=len(mmp_cids)):
  res = NameFromCID(cids)
  res2 = pd.DataFrame(res)
  mmp_names.append(res2)

mmp_names_df = pd.concat(mmp_names, ignore_index=True)
mmp_names_df

100%|███████████████████████████████████████| 1009/1009 [18:23<00:00,  1.09s/it]


Unnamed: 0,chemical name,PUBCHEM_CID
0,estradiol cypionate,9403
1,tert-Butylhydroquinone,16043
2,Malachite Green Oxalate,2724411
3,5424-37-3,79472
4,Buclizine dihydrochloride,65463
...,...,...
1004,Mesotrione,175967
1005,Indole-3-carbinol,3712
1006,PYRENE,31423
1007,methoxychlor,4115


In [14]:
mmp_names_df = mmp_names_df.rename(columns={'PUBCHEM_CID':'CID'})
mmp_names_df['chemical name'] = mmp_names_df['chemical name'].str.upper()

mmp_w_names = pd.merge(mmp_toxic, mmp_names_df, on='CID')
mmp_w_names

Unnamed: 0,standardised_smiles,CID,source,mitochondrial toxic,chemical name
0,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,9403,tox21_mmp,toxic,ESTRADIOL CYPIONATE
1,CC(C)(C)c1cc(O)ccc1O,16043,tox21_mmp,toxic,TERT-BUTYLHYDROQUINONE
2,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,2724411,tox21_mmp,toxic,MALACHITE GREEN OXALATE
3,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,79472,tox21_mmp,toxic,5424-37-3
4,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,65463,tox21_mmp,toxic,BUCLIZINE DIHYDROCHLORIDE
...,...,...,...,...,...
1004,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...,175967,tox21_mmp,toxic,MESOTRIONE
1005,OCc1c[nH]c2ccccc12,3712,tox21_mmp,toxic,INDOLE-3-CARBINOL
1006,c1cc2ccc3cccc4ccc(c1)c2c34,31423,tox21_mmp,toxic,PYRENE
1007,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1,4115,tox21_mmp,toxic,METHOXYCHLOR


In [15]:
mmp_smiles = mmp_toxic["standardised_smiles"].tolist()

mmp_inchikey = []

for s in tqdm(mmp_smiles, total=len(mmp_smiles)):
  res = InChIKeyFromSMILES(s)
  res2 = pd.DataFrame(res)
  mmp_inchikey.append(res2)

mmp_inchikey_df = pd.concat(mmp_inchikey, ignore_index=True)
mmp_inchikey_df

 91%|████████████████████████████████████▍   | 918/1009 [10:54<00:59,  1.54it/s]

Exception caught: 'PUGREST.BadRequest: error: '
Retrying in 3 seconds...


100%|███████████████████████████████████████| 1009/1009 [12:03<00:00,  1.39it/s]


Unnamed: 0,CID,InChIKey,standardised_smiles
0,9403,UOACKFBJUYNSLK-XRKIENNPSA-N,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...
1,16043,BGNXCDMCOKJUMV-UHFFFAOYSA-N,CC(C)(C)c1cc(O)ccc1O
2,2724411,CNYGFPPAGUCRIC-UHFFFAOYSA-L,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...
3,79472,GKQYGRWQCWWSHN-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...
4,65463,SDBHDSZKNVDKNU-UHFFFAOYSA-N,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...
...,...,...,...
1003,175967,KPUREKXXPHOJQT-UHFFFAOYSA-N,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...
1004,3712,IVYPNXXAYMYVSP-UHFFFAOYSA-N,OCc1c[nH]c2ccccc12
1005,31423,BBEAQIROQSPTKN-UHFFFAOYSA-N,c1cc2ccc3cccc4ccc(c1)c2c34
1006,4115,IAKOZHOLGAGEJT-UHFFFAOYSA-N,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1


In [16]:
mmp_inchikey_df = mmp_inchikey_df.drop('CID', axis=1)

mmp_toxic_filt = pd.merge(mmp_w_names, mmp_inchikey_df, on='standardised_smiles')
mmp_toxic_filt = mmp_toxic_filt[['chemical name', 'InChIKey', 'standardised_smiles', 'source', 'mitochondrial toxic']]
mmp_toxic_filt

Unnamed: 0,chemical name,InChIKey,standardised_smiles,source,mitochondrial toxic
0,ESTRADIOL CYPIONATE,UOACKFBJUYNSLK-XRKIENNPSA-N,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,tox21_mmp,toxic
1,TERT-BUTYLHYDROQUINONE,BGNXCDMCOKJUMV-UHFFFAOYSA-N,CC(C)(C)c1cc(O)ccc1O,tox21_mmp,toxic
2,MALACHITE GREEN OXALATE,CNYGFPPAGUCRIC-UHFFFAOYSA-L,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,tox21_mmp,toxic
3,5424-37-3,GKQYGRWQCWWSHN-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,tox21_mmp,toxic
4,BUCLIZINE DIHYDROCHLORIDE,SDBHDSZKNVDKNU-UHFFFAOYSA-N,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,tox21_mmp,toxic
...,...,...,...,...,...
1003,MESOTRIONE,KPUREKXXPHOJQT-UHFFFAOYSA-N,CS(=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c([N+](=O)...,tox21_mmp,toxic
1004,INDOLE-3-CARBINOL,IVYPNXXAYMYVSP-UHFFFAOYSA-N,OCc1c[nH]c2ccccc12,tox21_mmp,toxic
1005,PYRENE,BBEAQIROQSPTKN-UHFFFAOYSA-N,c1cc2ccc3cccc4ccc(c1)c2c34,tox21_mmp,toxic
1006,METHOXYCHLOR,IAKOZHOLGAGEJT-UHFFFAOYSA-N,COc1ccc(C(c2ccc(OC)cc2)C(Cl)(Cl)Cl)cc1,tox21_mmp,toxic


In [17]:
#mmp_toxic_filt.to_csv('../AL00_datasets/tox21_mmp_toxic_filt_before_join.csv', index=False)

# ChEMBL Mito Safe Drugs - subset + generate InChIKey

In [18]:
chembl_mito_safe = pd.read_csv("../AL00_datasets/chembl_mito_safe_all_alerts_removed_smiles_standardised.csv")
chembl_mito_safe

Unnamed: 0,action_type,direct_interaction,disease_efficacy,max_phase,mechanism_of_action,molecule_chembl_id,parent_molecule_chembl_id,target_chembl_id,molecule_structures,pref_name,canonical_smiles,standardised_smiles
0,ACTIVATOR,1,1,4,Soluble guanylate cyclase activator,CHEMBL6622,CHEMBL6622,CHEMBL2111348,{'canonical_smiles': 'O=[N+]([O-])O[C@H]1CO[C@...,ISOSORBIDE DINITRATE,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...
1,INHIBITOR,1,1,4,Thiazide-sensitive sodium-chloride cotransport...,CHEMBL406,CHEMBL406,CHEMBL1876,{'canonical_smiles': 'CC1Cc2ccccc2N1NC(=O)c1cc...,INDAPAMIDE,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1
2,INHIBITOR,1,1,4,DNA inhibitor,CHEMBL416,CHEMBL416,CHEMBL2311221,{'canonical_smiles': 'COc1c2occc2cc2ccc(=O)oc1...,METHOXSALEN,COc1c2occc2cc2ccc(=O)oc12,COc1c2occc2cc2ccc(=O)oc12
3,INHIBITOR,1,1,4,Carbonic anhydrase I inhibitor,CHEMBL17,CHEMBL17,CHEMBL261,{'canonical_smiles': 'NS(=O)(=O)c1cc(Cl)c(Cl)c...,DICHLORPHENAMIDE,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,INHIBITOR,1,1,4,Carbonic anhydrase inhibitor,CHEMBL18,CHEMBL18,CHEMBL2095180,{'canonical_smiles': 'CCOc1ccc2nc(S(N)(=O)=O)s...,ETHOXZOLAMIDE,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
...,...,...,...,...,...,...,...,...,...,...,...,...
998,INHIBITOR,1,1,4,Tyrosine-protein kinase JAK3 inhibitor,CHEMBL5314649,CHEMBL4085457,CHEMBL2148,{'canonical_smiles': 'C=CC(=O)N1C[C@H](Nc2ncnc...,RITLECITINIB TOSYLATE,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....
999,INHIBITOR,1,1,4,Tyrosine-protein kinase TYK2 inhibitor,CHEMBL5315119,CHEMBL3622821,CHEMBL3553,{'canonical_smiles': 'CC[C@@H]1CN(C(=O)NCC(F)(...,UPADACITINIB HEMIHYDRATE,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...
1000,INHIBITOR,1,1,4,Ileal bile acid transporter inhibitor,CHEMBL5315120,CHEMBL4297588,CHEMBL2778,{'canonical_smiles': 'CCCCC1(CCCC)CN(c2ccccc2)...,ODEVIXIBAT SESQUIHYDRATE,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
1001,INHIBITOR,1,1,4,Envelope phospholipase OPG057 inhibitor,CHEMBL5315121,CHEMBL1257073,CHEMBL5308522,{'canonical_smiles': 'O.O=C(NN1C(=O)[C@@H]2[C@...,TECOVIRIMAT MONOHYDRATE,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...


In [19]:
chembl_mito_safe_filt = chembl_mito_safe[['pref_name', 'standardised_smiles']]
chembl_mito_safe_filt['source'] = "chembl_mito_safe_drugs"
chembl_mito_safe_filt['mitochondrial toxic'] = "non-toxic"
chembl_mito_safe_filt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_mito_safe_filt['source'] = "chembl_mito_safe_drugs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_mito_safe_filt['mitochondrial toxic'] = "non-toxic"


Unnamed: 0,pref_name,standardised_smiles,source,mitochondrial toxic
0,ISOSORBIDE DINITRATE,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,chembl_mito_safe_drugs,non-toxic
1,INDAPAMIDE,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1,chembl_mito_safe_drugs,non-toxic
2,METHOXSALEN,COc1c2occc2cc2ccc(=O)oc12,chembl_mito_safe_drugs,non-toxic
3,DICHLORPHENAMIDE,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1,chembl_mito_safe_drugs,non-toxic
4,ETHOXZOLAMIDE,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,chembl_mito_safe_drugs,non-toxic
...,...,...,...,...
998,RITLECITINIB TOSYLATE,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,chembl_mito_safe_drugs,non-toxic
999,UPADACITINIB HEMIHYDRATE,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,chembl_mito_safe_drugs,non-toxic
1000,ODEVIXIBAT SESQUIHYDRATE,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,chembl_mito_safe_drugs,non-toxic
1001,TECOVIRIMAT MONOHYDRATE,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,chembl_mito_safe_drugs,non-toxic


In [20]:
chembl_smiles = chembl_mito_safe_filt["standardised_smiles"].tolist()

chembl_inchikey = []

for sm in tqdm(chembl_smiles, total=len(chembl_smiles)):
  res = InChIKeyFromSMILES(sm)
  res2 = pd.DataFrame(res)
  chembl_inchikey.append(res2)

chembl_inchikey_df = pd.concat(chembl_inchikey, ignore_index=True)
chembl_inchikey_df

100%|███████████████████████████████████████| 1003/1003 [12:03<00:00,  1.39it/s]


Unnamed: 0,CID,InChIKey,standardised_smiles
0,6883,MOYKHGMNXAOIAT-JGWLITMVSA-N,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...
1,3702,NDDAHWYSQHTHNT-UHFFFAOYSA-N,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1
2,4114,QXKHYNVANLEOEG-UHFFFAOYSA-N,COc1c2occc2cc2ccc(=O)oc12
3,3038,GJQPMPFPNINLKP-UHFFFAOYSA-N,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,3295,OUZWUKMCLIBBOG-UHFFFAOYSA-N,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
...,...,...,...
998,145722621,YOZLVAFWYLSRRN-VZXYPILPSA-N,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....
999,133053456,GJMQTRCDSIQEFK-SCDRJROZSA-N,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...
1000,156028087,UIYFGCAQGONAMU-ZHQCGWDOSA-N,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...
1001,90479083,QRHXYGPOQKLBJP-NPIFKJBVSA-N,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...


In [21]:
chembl_inchikey_df = chembl_inchikey_df.drop('CID', axis=1)

chembl_mito_safe_nontoxic = pd.merge(chembl_mito_safe_filt, chembl_inchikey_df, on='standardised_smiles')

chembl_mito_safe_nontoxic = chembl_mito_safe_nontoxic.rename(columns={'pref_name':'chemical name'})

chembl_mito_safe_nontoxic = chembl_mito_safe_nontoxic[['chemical name', 'InChIKey', 'standardised_smiles', 'source', 'mitochondrial toxic']]
chembl_mito_safe_nontoxic

Unnamed: 0,chemical name,InChIKey,standardised_smiles,source,mitochondrial toxic
0,ISOSORBIDE DINITRATE,MOYKHGMNXAOIAT-JGWLITMVSA-N,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,chembl_mito_safe_drugs,non-toxic
1,INDAPAMIDE,NDDAHWYSQHTHNT-UHFFFAOYSA-N,CC1Cc2ccccc2N1NC(=O)c1ccc(Cl)c(S(N)(=O)=O)c1,chembl_mito_safe_drugs,non-toxic
2,METHOXSALEN,QXKHYNVANLEOEG-UHFFFAOYSA-N,COc1c2occc2cc2ccc(=O)oc12,chembl_mito_safe_drugs,non-toxic
3,DICHLORPHENAMIDE,GJQPMPFPNINLKP-UHFFFAOYSA-N,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1,chembl_mito_safe_drugs,non-toxic
4,ETHOXZOLAMIDE,OUZWUKMCLIBBOG-UHFFFAOYSA-N,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,chembl_mito_safe_drugs,non-toxic
...,...,...,...,...,...
998,RITLECITINIB TOSYLATE,YOZLVAFWYLSRRN-VZXYPILPSA-N,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,chembl_mito_safe_drugs,non-toxic
999,UPADACITINIB HEMIHYDRATE,GJMQTRCDSIQEFK-SCDRJROZSA-N,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,chembl_mito_safe_drugs,non-toxic
1000,ODEVIXIBAT SESQUIHYDRATE,UIYFGCAQGONAMU-ZHQCGWDOSA-N,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,chembl_mito_safe_drugs,non-toxic
1001,TECOVIRIMAT MONOHYDRATE,QRHXYGPOQKLBJP-NPIFKJBVSA-N,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,chembl_mito_safe_drugs,non-toxic


In [22]:
#chembl_mito_safe_nontoxic.to_csv('../AL00_datasets/chembl_mito_safe_non_toxic_filt_before_join.csv', index=False)

# Join Tox21 MMP and CheMBL mito safe dataframes together

In [23]:
AL_training = pd.concat([mmp_toxic_filt, chembl_mito_safe_nontoxic], ignore_index=True, sort=False)

AL_training = AL_training.drop_duplicates(subset='InChIKey', keep="first") # ensures that duplicates are handled as 'toxic' as retains first instance from the MMP data (toxic dataset)

AL_training

Unnamed: 0,chemical name,InChIKey,standardised_smiles,source,mitochondrial toxic
0,ESTRADIOL CYPIONATE,UOACKFBJUYNSLK-XRKIENNPSA-N,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,tox21_mmp,toxic
1,TERT-BUTYLHYDROQUINONE,BGNXCDMCOKJUMV-UHFFFAOYSA-N,CC(C)(C)c1cc(O)ccc1O,tox21_mmp,toxic
2,MALACHITE GREEN OXALATE,CNYGFPPAGUCRIC-UHFFFAOYSA-L,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,tox21_mmp,toxic
3,5424-37-3,GKQYGRWQCWWSHN-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,tox21_mmp,toxic
4,BUCLIZINE DIHYDROCHLORIDE,SDBHDSZKNVDKNU-UHFFFAOYSA-N,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,tox21_mmp,toxic
...,...,...,...,...,...
2006,RITLECITINIB TOSYLATE,YOZLVAFWYLSRRN-VZXYPILPSA-N,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,chembl_mito_safe_drugs,non-toxic
2007,UPADACITINIB HEMIHYDRATE,GJMQTRCDSIQEFK-SCDRJROZSA-N,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,chembl_mito_safe_drugs,non-toxic
2008,ODEVIXIBAT SESQUIHYDRATE,UIYFGCAQGONAMU-ZHQCGWDOSA-N,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,chembl_mito_safe_drugs,non-toxic
2009,TECOVIRIMAT MONOHYDRATE,QRHXYGPOQKLBJP-NPIFKJBVSA-N,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,chembl_mito_safe_drugs,non-toxic


In [24]:
AL_training['mitochondrial toxic'].value_counts()

toxic        1008
non-toxic     970
Name: mitochondrial toxic, dtype: int64

In [25]:
#AL_training.to_csv('../AL00_datasets/AL_training_data_tox21_mmp_chembl_mito_safe_join.csv', index=False)