In [5]:
import pandas as pd

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [6]:
# Create a function to generate molecular descriptors (MW, LogP, HBD, HBA, TPSA, etc.) using rdkit

def EDAdescriptors(smiles, verbose=False):
    """Compute RDKit physicochemical descriptors for a collection of SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to describe. The iterable is traversed exactly once,
        so lists, pandas Series and one-shot generators all work.
    verbose : bool, default False
        If True, print how many SMILES RDKit could not parse, followed by the
        offending strings.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES that RDKit parsed successfully, in input order,
        with the columns ``standardised_smiles``, ``MW``, ``LogP``,
        ``Crippen MR``, ``NHD``, ``NHA``, ``TPSA`` and ``HA Count``.
        SMILES for which ``Chem.MolFromSmiles`` returns ``None`` are skipped.
        The columns are always present, even when no row is produced, so the
        result can be merged on ``standardised_smiles`` unconditionally.
    """
    columns = ["standardised_smiles", "MW", "LogP", "Crippen MR",
               "NHD", "NHA", "TPSA", "HA Count"]

    rows = []
    unparsed = []
    n_seen = 0

    # Single pass: parse and describe each SMILES in turn, so the input is
    # never iterated a second time.
    for smiles_orig in smiles:
        n_seen += 1
        mol = Chem.MolFromSmiles(smiles_orig)
        if mol is None:
            # RDKit signals a parse failure by returning None, not by raising.
            unparsed.append(smiles_orig)
            continue

        rows.append({"standardised_smiles": smiles_orig,
                     "MW": Descriptors.MolWt(mol),
                     "LogP": Descriptors.MolLogP(mol),
                     "Crippen MR": Descriptors.MolMR(mol),
                     "NHD": Lipinski.NumHDonors(mol),
                     "NHA": Lipinski.NumHAcceptors(mol),
                     "TPSA": Descriptors.TPSA(mol),
                     "HA Count": Descriptors.HeavyAtomCount(mol)})

    if verbose and unparsed:
        print(f"EDAdescriptors: skipped {len(unparsed)} of {n_seen} SMILES "
              f"that RDKit could not parse: {unparsed}")

    # Passing `columns` fixes the schema, so an empty result still carries
    # the expected column names.
    return pd.DataFrame(rows, columns=columns)

# AL training data: RDKit physicochemical properties

In [7]:
# Load the AL training set and cast the SMILES column to str in one chained step.
al_training = (
    pd.read_csv('../AL00_datasets/AL_training_data_tox21_mmp_chembl_mito_safe_join.csv')
    .assign(standardised_smiles=lambda frame: frame['standardised_smiles'].astype(str))
)
al_training

Unnamed: 0,chemical name,InChIKey,standardised_smiles,source,mitochondrial toxic
0,ESTRADIOL CYPIONATE,UOACKFBJUYNSLK-XRKIENNPSA-N,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,tox21_mmp,toxic
1,TERT-BUTYLHYDROQUINONE,BGNXCDMCOKJUMV-UHFFFAOYSA-N,CC(C)(C)c1cc(O)ccc1O,tox21_mmp,toxic
2,MALACHITE GREEN OXALATE,CNYGFPPAGUCRIC-UHFFFAOYSA-L,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,tox21_mmp,toxic
3,5424-37-3,GKQYGRWQCWWSHN-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,tox21_mmp,toxic
4,BUCLIZINE DIHYDROCHLORIDE,SDBHDSZKNVDKNU-UHFFFAOYSA-N,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,tox21_mmp,toxic
...,...,...,...,...,...
1973,RITLECITINIB TOSYLATE,YOZLVAFWYLSRRN-VZXYPILPSA-N,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,chembl_mito_safe_drugs,non-toxic
1974,UPADACITINIB HEMIHYDRATE,GJMQTRCDSIQEFK-SCDRJROZSA-N,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,chembl_mito_safe_drugs,non-toxic
1975,ODEVIXIBAT SESQUIHYDRATE,UIYFGCAQGONAMU-ZHQCGWDOSA-N,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,chembl_mito_safe_drugs,non-toxic
1976,TECOVIRIMAT MONOHYDRATE,QRHXYGPOQKLBJP-NPIFKJBVSA-N,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,chembl_mito_safe_drugs,non-toxic


In [8]:
# Compute the RDKit descriptor table from the training set's SMILES column.
al_training_rdkit = EDAdescriptors(al_training['standardised_smiles'])
al_training_rdkit

Unnamed: 0,standardised_smiles,MW,LogP,Crippen MR,NHD,NHA,TPSA,HA Count
0,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,396.571,6.13050,113.7958,1,3,46.53,29
1,CC(C)(C)c1cc(O)ccc1O,166.220,2.39530,48.4716,2,2,40.46,12
2,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,927.020,3.58460,257.8140,4,10,241.96,68
3,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,445.354,5.05184,128.9222,4,5,118.95,30
4,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,505.961,7.38830,146.2250,0,2,6.48,33
...,...,...,...,...,...,...,...,...
1973,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,457.556,3.17702,122.8990,3,6,128.28,32
1974,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,778.763,4.99110,188.3986,4,8,188.14,55
1975,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,1535.935,9.28390,407.0744,10,18,443.24,105
1976,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,394.349,1.57860,89.0740,1,3,97.98,28


In [9]:
# Attach the descriptors to the annotated training rows, keyed on SMILES.
# The descriptor table is de-duplicated on the key and the join is validated
# as many-to-one, so a SMILES that occurs more than once in the training set
# cannot multiply rows. The join stays inner (the pandas default): rows whose
# SMILES has no descriptor row are dropped, exactly as before.
al_training_EDA = pd.merge(
    al_training,
    al_training_rdkit.drop_duplicates(subset='standardised_smiles'),
    on='standardised_smiles',
    validate='many_to_one',
)
al_training_EDA

Unnamed: 0,chemical name,InChIKey,standardised_smiles,source,mitochondrial toxic,MW,LogP,Crippen MR,NHD,NHA,TPSA,HA Count
0,ESTRADIOL CYPIONATE,UOACKFBJUYNSLK-XRKIENNPSA-N,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,tox21_mmp,toxic,396.571,6.13050,113.7958,1,3,46.53,29
1,TERT-BUTYLHYDROQUINONE,BGNXCDMCOKJUMV-UHFFFAOYSA-N,CC(C)(C)c1cc(O)ccc1O,tox21_mmp,toxic,166.220,2.39530,48.4716,2,2,40.46,12
2,MALACHITE GREEN OXALATE,CNYGFPPAGUCRIC-UHFFFAOYSA-L,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...,tox21_mmp,toxic,927.020,3.58460,257.8140,4,10,241.96,68
3,5424-37-3,GKQYGRWQCWWSHN-UHFFFAOYSA-N,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n...,tox21_mmp,toxic,445.354,5.05184,128.9222,4,5,118.95,30
4,BUCLIZINE DIHYDROCHLORIDE,SDBHDSZKNVDKNU-UHFFFAOYSA-N,CC(C)(C)c1ccc(CN2CCN(C(c3ccccc3)c3ccc(Cl)cc3)C...,tox21_mmp,toxic,505.961,7.38830,146.2250,0,2,6.48,33
...,...,...,...,...,...,...,...,...,...,...,...,...
1973,RITLECITINIB TOSYLATE,YOZLVAFWYLSRRN-VZXYPILPSA-N,C=CC(=O)N1C[C@H](Nc2ncnc3[nH]ccc23)CC[C@@H]1C....,chembl_mito_safe_drugs,non-toxic,457.556,3.17702,122.8990,3,6,128.28,32
1974,UPADACITINIB HEMIHYDRATE,GJMQTRCDSIQEFK-SCDRJROZSA-N,CC[C@@H]1CN(C(=O)NCC(F)(F)F)C[C@@H]1c1cnc2cnc3...,chembl_mito_safe_drugs,non-toxic,778.763,4.99110,188.3986,4,8,188.14,55
1975,ODEVIXIBAT SESQUIHYDRATE,UIYFGCAQGONAMU-ZHQCGWDOSA-N,CCCCC1(CCCC)CN(c2ccccc2)c2cc(SC)c(OCC(=O)N[C@@...,chembl_mito_safe_drugs,non-toxic,1535.935,9.28390,407.0744,10,18,443.24,105
1976,TECOVIRIMAT MONOHYDRATE,QRHXYGPOQKLBJP-NPIFKJBVSA-N,O.O=C(NN1C(=O)[C@@H]2[C@@H]3C=C[C@@H]([C@H]4C[...,chembl_mito_safe_drugs,non-toxic,394.349,1.57860,89.0740,1,3,97.98,28


In [10]:
# Optional export of the merged table (left disabled); uncomment to write the CSV.
#al_training_EDA.to_csv('../AL00_datasets/al_training_EDA_annotation.csv', index=False)