In [1]:
import os
import subprocess
import pandas as pd

In [2]:
## all generated files go into a 'results' folder under the current working directory
dir_outputs = os.path.join(os.getcwd(), 'results')
if os.path.exists(dir_outputs):
    print(f'{dir_outputs} is existing')
else:
    os.makedirs(dir_outputs)

/mnt/data0/Research/5_Automation/mmp/rdkit/ADMETox_Update_1_2024Aug27/results is existing


#### 1. Prepare the SMILES file and property CSV file

In [3]:
## names of the columns holding the compound ID, the structure (SMILES) and the project
colName_mid = 'Compound Name'
colName_smi = 'Structure'    # 'Smiles'
colName_proj = 'Concat;Project'

## read the raw table, then keep only the rows that have both an ID and a structure
dataTable_raw = (
    pd.read_csv('./Data_ADMET_4_MMP_2024Aug27.csv', low_memory=False)
    .dropna(subset=[colName_mid, colName_smi])
    .reset_index(drop=True)
)
print(f'There are total {len(dataTable_raw)} molecules in the table with Structure(SMILES)')
dataTable_raw.head(3)

There are total 25714 molecules in the table with Structure(SMILES)


Unnamed: 0,Compound Name,Structure,Smiles,Concat;Project,Concat;External Id,Created On,Log D,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),...,Marked,logD_CDD,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0026812,C[C@@H]%10CNc2c(sc3ccc1nc(ccc1c32)-c4ccc(cc4)N...,C[C@@H]1CNC2=C(SC3=CC=C4N=C(C5=CC=C(N6CCC(CN7C...,MK2,MK2-361-001F,12-Aug-2021,5.59691,,,,...,UNMARKED,5.59691,13.8,-0.17493,,,,45.74136,45.74136,1.0
1,KT-0026813,CN2C(=O)N(C1CCC(=O)NC1=O)c3cccc(c32)N%10CCC(CN...,CN1C(=O)N(C2CCC(=O)NC2=O)C2=CC=CC(N3CCC(CN4CCC...,MK2,MK2-367-001N,12-Aug-2021,2.20623,,,,...,UNMARKED,2.20623,0.0,0.0,,,,,,
2,KT-0026814,Fc1c(cccc1)-c5nc4c(c2c(sc3c2NC[C@H](NC3=O)C)cc...,N1C(=O)C2=C(C3=C(C=CC4=NC(C5=CC=CC=C5F)=CC=C43...,MK2,MK2-368-001H,12-Aug-2021,3.83864,,,,...,UNMARKED,3.83864,,,,,,,,


In [4]:
colName_prop_list = ['F%_Rat', 'EstFa_Rat', 'permeability', 'efflux', 'hERG_IC50', 'hERG_mixedIC50', 'logD_CDD']

## the SMILES file for fragmentation
file_smi = f'{dir_outputs}/Compounds_All.smi'
file_prop_csv = f'{dir_outputs}/Property_All.csv'
delimiter = ' '
missing_marker = '*'    # written to the property file wherever a value is absent or not numeric


def _to_float_or_marker(raw_value, mol_id):
    """Convert one property cell to a float, falling back to ``missing_marker``.

    Parameters
    ----------
    raw_value : scalar
        The cell value taken from the raw table.
    mol_id : object
        Compound ID, only used in the message printed on a failed conversion.

    Returns
    -------
    float or str
        ``float(raw_value)``; ``missing_marker`` if the value is NaN/None or
        cannot be converted (a message is printed in the latter case).
    """
    if pd.isna(raw_value):
        return missing_marker
    try:
        return float(raw_value)
    except (TypeError, ValueError, OverflowError) as e:
        print(f'This mol {mol_id} does not have a proper property value: {e}')
        return missing_marker


## property columns that are absent from the table are reported once (not once per row)
## and end up filled with the marker, because reindex() creates them as all-NaN columns
missing_cols = [name for name in colName_prop_list if name not in dataTable_raw.columns]
if missing_cols:
    print(f'These property columns are not in the table and will be all "{missing_marker}": {missing_cols}')
prop_values = dataTable_raw.reindex(columns=colName_prop_list)

## single pass over the rows: write the SMILES line and collect the property record
data_dict_prop = {}
with open(file_smi, "w") as output_file:
    # output_file.write(f'SMILES{delimiter}ID' + "\n")
    row_iter = zip(
        dataTable_raw.index,
        dataTable_raw[colName_mid],
        dataTable_raw[colName_smi],
        prop_values.itertuples(index=False, name=None),
    )
    for idx, mol_id, mol_smi, raw_props in row_iter:
        ## prepare the SMILES output (one "<smiles><delimiter><id>" line per molecule)
        output_file.write(f'{mol_smi}{delimiter}{mol_id}\n')

        ## prepare the property CSV output
        record = {'ID': mol_id}
        for prop_name, raw_value in zip(colName_prop_list, raw_props):
            record[prop_name] = _to_float_or_marker(raw_value, mol_id)
        data_dict_prop[idx] = record
## report only after the file has been closed by the `with` block
print(f'The SMILES strings have been saved into file: {file_smi}')

## save the csv results
## dtype=object keeps floats and the marker string side by side in the same column
data_table_prop = pd.DataFrame(
    list(data_dict_prop.values()),
    index=list(data_dict_prop.keys()),
    columns=['ID', *colName_prop_list],
    dtype=object,
)
data_table_prop.to_csv(file_prop_csv, index=False, sep=delimiter)
print(data_table_prop.shape)
data_table_prop.head(3)

The SMILES strings have been saved into file: /mnt/data0/Research/5_Automation/mmp/rdkit/ADMETox_Update_1_2024Aug27/results/Compounds_All.smi
(25714, 8)


Unnamed: 0,ID,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_mixedIC50,logD_CDD
0,KT-0026812,13.8,-0.17493,*,*,*,45.74136,5.59691
1,KT-0026813,0.0,0.0,*,*,*,*,2.20623
2,KT-0026814,*,*,*,*,*,*,3.83864


#### 2. Fragment the SMILES

In [5]:
file_fragdb = f'{dir_outputs}/Compounds_All.fragdb'

## Run `mmpdb fragment` on the SMILES file.
## stdout is captured; stderr is not redirected.
## check=True raises subprocess.CalledProcessError on a non-zero exit status, so the
## completion message below is only printed when the command actually succeeded.
commandLine = ['mmpdb', 'fragment', file_smi, '-o', file_fragdb]
process = subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
output, error = process.stdout, process.stderr    # error is None because stderr is not captured
print(f'The fragmentation is completed and saved into file {file_fragdb}')

Preparing record 18631[00:49:04] Conflicting single bond directions around double bond at index 52.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflicting single bond directions around double bond at index 51.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflicting single bond directions around double bond at index 50.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflicting single bond directions around double bond at index 49.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflicting single bond directions around double bond at index 22.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflicting single bond directions around double bond at index 49.
[00:49:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:04] Conflic

The fragmentation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/ADMETox_Update_1_2024Aug27/results/Compounds_All.fragdb


                    

#### 3. Index the fragment file to find the matched molecular pairs
#### 4. Load the activity/property data (done in the same `mmpdb index` call via `--properties`)

In [6]:
file_mmpdb = f'{dir_outputs}/Compounds_All.mmpdb'

## Run `mmpdb index` on the fragment file, passing the property file via --properties.
## stdout is captured; stderr is not redirected.
## check=True raises subprocess.CalledProcessError on a non-zero exit status, so the
## completion message below is only printed when the command actually succeeded.
commandLine = ['mmpdb', 'index', file_fragdb, '-o', file_mmpdb, '--properties', file_prop_csv]
process = subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
output, error = process.stdout, process.stderr    # error is None because stderr is not captured
print(f'The indexing/mmp generation is completed and saved into file {file_mmpdb}')

                                                                            

The indexing/mmp generation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/ADMETox_Update_1_2024Aug27/results/Compounds_All.mmpdb


In [7]:
# Show the summary of the generated database (counts of compounds, rules, pairs, environments, stats and the property names)
!mmpdb list ./results/Compounds_All.mmpdb

             Name             #cmpds #rules  #pairs  #envs   #stats  |-------------------------------------------------- Title ---------------------------------------------------| Properties
./results/Compounds_All.mmpdb  12288 206071 1562826 1350369 1548581  MMPs from '/mnt/data0/Research/5_Automation/mmp/rdkit/ADMETox_Update_1_2024Aug27/results/Compounds_All.fragdb' F%_Rat EstFa_Rat permeability efflux hERG_IC50 hERG_mixedIC50 logD_CDD


In [None]:
# !mmpdb --help
# !mmpdb help-admin
# !mmpdb index --help

In [None]:
# !mmpdb rulecat --help
# !mmpdb rulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_rulecat.csv

# !mmpdb ruleenvcat --help
# !mmpdb ruleenvcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_ruleenvcat.csv

# !mmpdb propcat --help
# !mmpdb propcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_propcat.csv

# !mmpdb proprulecat --help
# !mmpdb proprulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_proprulecat.csv