In [1]:
import os
import subprocess
import pandas as pd

In [2]:
## Folder (under the current working directory) that receives every file written by this notebook.
dir_outputs = os.path.join(os.getcwd(), 'results')
if os.path.isdir(dir_outputs):
    print(f'{dir_outputs} is existing')
else:
    ## exist_ok=True tolerates the folder appearing between the check and the call;
    ## a regular file with this name still raises FileExistsError here instead of
    ## being reported as an existing output folder.
    os.makedirs(dir_outputs, exist_ok=True)

/mnt/data0/Research/5_Automation/mmp/rdkit/ADMET/results is existing


#### 1. Prepare the SMILES file and property CSV file

In [3]:
## Names of the columns this notebook reads from the input table.
colName_mid = 'Compound Name'
colName_smi = 'Structure'
colName_proj = 'Concat;Project'

## Read the table and keep only the rows that carry both a compound name and a
## structure; the index is renumbered from 0 after the incomplete rows are removed.
required_columns = [colName_mid, colName_smi]
dataTable_raw = (
    pd.read_csv('./Data_ADMET_4_MMP.csv')
    .dropna(subset=required_columns)
    .reset_index(drop=True)
)

num_molecules = dataTable_raw.shape[0]
print(f'There are total {num_molecules} molecules in the table with Structure(SMILES)')
dataTable_raw.head(3)

There are total 6150 molecules in the table with Structure(SMILES)


Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Concat;Comments,ADME MDCK(WT) Permeability;Concat;Run Date,...,ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num),F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0194988,C3(CCN(C(=O)c1cc(c(c(c1)N2CCC(=O)NC2=O)Cl)C)CC...,TYK2,PH-CMR-TK2-2575-0N-001,,,,,,,...,=,0.467074,10.8,0.125258,,,,11.044944,11.044944,1.0
1,KT-0194990,N1(CCC(=O)NC1=O)c2cc(cc(c2C)Cl)C(=O)N%11CCC%10...,TYK2,PH-CMR-TK2-2610-0N-001,,,,,,,...,=,1.265248,3.87,0.048645,,,,9.25991,9.25991,1.0
2,KT-0194991,CNc1cc(nn2c(cnc21)C(=O)N[C@H]3[C@H](OC)CC3)N5c...,TYK2,PH-CMR-TK2-2615-0N-001,,,,,,,...,=,4.445823,,,,,5.19295,,5.19295,0.0


In [4]:
colName_prop_list = ['F%_Rat', 'EstFa_Rat', 'permeability', 'efflux', 'hERG_IC50', 'hERG_mixedIC50']

## the SMILES file for fragmentation
file_smi = f'{dir_outputs}/Compounds_All.smi'
file_prop_csv = f'{dir_outputs}/Property_All.csv'
delimiter = ' '
missing_marker = "*"    ## written in place of a missing or non-numeric property value
## NOTE(review): both files are space-delimited, so a compound ID containing
## whitespace would presumably not survive the round trip; confirm IDs never do.


def _to_property_value(raw_value):
    """Convert one table cell into the value stored in the property file.

    Returns ``missing_marker`` when the cell is null, otherwise ``float(raw_value)``.
    Raises ``TypeError`` / ``ValueError`` (from ``float``) when the cell holds
    something that is present but cannot be read as a number.
    """
    if pd.isna(raw_value):
        return missing_marker
    return float(raw_value)


## Pull every needed column out of the table once, instead of rebuilding a
## whole-column not-null mask for each (molecule, property) pair.
mol_ids = dataTable_raw[colName_mid].tolist()
mol_smis = dataTable_raw[colName_smi].tolist()

prop_columns = {}
for prop_name in colName_prop_list:
    if prop_name in dataTable_raw.columns:
        prop_columns[prop_name] = dataTable_raw[prop_name].tolist()
    else:
        ## stay best-effort for an absent column, but report it once rather than once per molecule
        prop_columns[prop_name] = None
        print(f'Property column {prop_name} is not in the table; every molecule gets "{missing_marker}"')

## prepare the SMILES output: one "<SMILES><delimiter><ID>" line per molecule, no header line
with open(file_smi, "w") as output_file:
    output_file.writelines(
        f'{mol_smi}{delimiter}{mol_id}\n' for mol_smi, mol_id in zip(mol_smis, mol_ids)
    )
## reported only after the file has been closed (and therefore flushed)
print(f'The SMILES strings have been saved into file: {file_smi}')

## prepare the property CSV output: one record per molecule, keyed by its row label
data_dict_prop = {}
for row_pos, (idx, mol_id) in enumerate(zip(dataTable_raw.index, mol_ids)):
    record = {'ID': mol_id}
    for prop_name in colName_prop_list:
        column = prop_columns[prop_name]
        try:
            record[prop_name] = missing_marker if column is None else _to_property_value(column[row_pos])
        except (TypeError, ValueError) as e:
            record[prop_name] = missing_marker
            print(f'This mol {mol_id} does not have a proper property value: {e}')
    data_dict_prop[idx] = record

## save the csv results
## Explicit columns keep the header even for an empty table; dtype=object keeps
## floats and the "*" marker side by side without any re-typing of the columns.
data_table_prop = pd.DataFrame(
    list(data_dict_prop.values()),
    index=list(data_dict_prop.keys()),
    columns=['ID'] + colName_prop_list,
    dtype=object,
)
data_table_prop.to_csv(file_prop_csv, index=False, sep=delimiter)
data_table_prop.head(3)

The SMILES strings have been saved into file: /mnt/data0/Research/5_Automation/mmp/rdkit/ADMET/results/Compounds_All.smi


Unnamed: 0,ID,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_mixedIC50
0,KT-0194988,10.8,0.125258,*,*,*,11.044944
1,KT-0194990,3.87,0.048645,*,*,*,9.25991
2,KT-0194991,*,*,*,*,5.19295,5.19295


#### 2. Fragment the SMILES

In [5]:
file_fragdb = f'{dir_outputs}/Compounds_All.fragdb'

## Run `mmpdb fragment` on the SMILES file and write the result to file_fragdb.
commandLine = ['mmpdb', 'fragment', file_smi, '-o', file_fragdb]
## check=True raises subprocess.CalledProcessError on a non-zero exit status, so the
## message below is printed only when the command actually succeeded.
## Only stdout is piped; stderr is left un-piped, exactly as before.
process = subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
output, error = process.stdout, process.stderr    ## error is None because stderr is not piped
print(f'The fragmentation is completed and saved into file {file_fragdb}')

Preparing record 1989[13:17:14] Conflicting single bond directions around double bond at index 57.
[13:17:14]   BondStereo set to STEREONONE and single bond directions set to NONE.
[13:17:14] Conflicting single bond directions around double bond at index 22.
[13:17:14]   BondStereo set to STEREONONE and single bond directions set to NONE.
Preparing record 2334[13:17:15] Can't kekulize mol.  Unkekulized atoms: 2 4 5
[13:17:15] Conflicting single bond directions around double bond at index 40.
[13:17:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[13:17:15] Conflicting single bond directions around double bond at index 13.
[13:17:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[13:17:15] Conflicting single bond directions around double bond at index 22.
[13:17:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[13:17:15] Conflicting single bond directions around double bond at index 22.
[13:17:15]   BondStereo

The fragmentation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/ADMET/results/Compounds_All.fragdb


                    

#### 3. Index the fragment file to find the matched molecular pairs
#### 4. Load the activity/property data (done in the same `mmpdb index` call via `--properties`)

In [6]:
file_mmpdb = f'{dir_outputs}/Compounds_All.mmpdb'

## Run `mmpdb index` on the fragment file, passing the property CSV via --properties,
## and write the resulting database to file_mmpdb.
commandLine = ['mmpdb', 'index', file_fragdb, '-o', file_mmpdb, '--properties', file_prop_csv]
## check=True raises subprocess.CalledProcessError on a non-zero exit status, so the
## message below is printed only when the command actually succeeded.
## Only stdout is piped; stderr is left un-piped, exactly as before.
process = subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
output, error = process.stdout, process.stderr    ## error is None because stderr is not piped
print(f'The indexing/mmp generation is completed and saved into file {file_mmpdb}')

                                                                          

The indexing/mmp generation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/ADMET/results/Compounds_All.mmpdb


In [7]:
# Print mmpdb's one-line summary of the database (compound, rule, pair, environment
# and statistics counts, plus the loaded property names). The relative path matches
# file_mmpdb only while the notebook runs from the folder that contains ./results.
!mmpdb list ./results/Compounds_All.mmpdb

             Name             #cmpds #rules #pairs #envs  #stats  |---------------------------------------- Title ----------------------------------------| Properties
./results/Compounds_All.mmpdb   2876  18308 195300 126867 193582  MMPs from '/mnt/data0/Research/5_Automation/mmp/rdkit/ADMET/results/Compounds_All.fragdb' F%_Rat EstFa_Rat permeability efflux hERG_IC50 hERG_mixedIC50


In [None]:
# !mmpdb --help
# !mmpdb help-admin
# !mmpdb index --help

In [None]:
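# NOTE: the examples below reference a different database file (hERG_All_1956_2024Jun14.mmpdb),
# not the Compounds_All.mmpdb built in this notebook; adjust the paths before running them.
# !mmpdb rulecat --help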
# !mmpdb rulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_rulecat.csv

# !mmpdb ruleenvcat --help
# !mmpdb ruleenvcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_ruleenvcat.csv

# !mmpdb propcat --help
# !mmpdb propcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_propcat.csv

# !mmpdb proprulecat --help
# !mmpdb proprulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_proprulecat.csv