In [1]:
import os
import subprocess
import pandas as pd

In [2]:
# Folder (under the current working directory) that receives every file
# generated by this notebook.
dir_outputs = os.path.join(os.getcwd(), 'results')
try:
    # Create-then-catch avoids the gap between an "exists?" check and the
    # actual creation, and keeps the side effect out of a conditional expression.
    os.makedirs(dir_outputs)
except FileExistsError:
    print(f'{dir_outputs} is existing')

/mnt/data0/Research/5_Automation/mmp/rdkit/Test_1_hERG_2024Jun12/results is existing


#### 1. Prepare the SMILES file and property CSV file

In [3]:
# Load the raw table and keep only the rows that have both a molecule name
# and a SMILES string; the index is renumbered after the filtering.
hERG_csv_path = './hERG_regression_All_1956_2024May08.csv'
dataTable_raw = (
    pd.read_csv(hERG_csv_path)
    .dropna(subset=['Molecule Name', 'SMILES'])
    .reset_index(drop=True)
)

n_molecules = len(dataTable_raw)
print(f'There are total {n_molecules} molecules in the table with SMILES')

# Preview of the first rows (rendered as the cell output).
dataTable_raw.head(3)

There are total 1956 molecules in the table with SMILES


Unnamed: 0,Molecule Name,proj_mcf,SMILES,Molecular weight (g/mol),Projects,Batch Name,Batch Molecule-Batch ID,Projects_sorted,hERG_IC50_list,hERG_IC50_cmts,hERG_inhibition_details,ambitiousData,hERG_eIC50_list,hERG_IC50_mean,hERG_assay_date,hERG_IC50_range,hERG_eIC50_mean,hERG_IC50_merged_uM,hERG_pIC50_merged
0,KT-0003545,IRAK4,[C@H]1(N2C=C3C=C(NC(=O)C4=CC=CC(C(F)(F)F)=N4)C...,843.952,"CRBN MGD Library, IRAK4",13,KT-0003545-013,"CRBN MGD Library, IRAK4","[['12.343', '3/29/2022'], ['12.343', '3/29/202...",[],{},0,[],12.343,3/29/2022,,,12.343,4.908579
1,KT-0004511,IRAK4,[C@H]1(N2C=C3C=C(NC(=O)C4=CC=CC(C(F)(F)F)=N4)C...,849.862,IRAK4,3,KT-0004511-003,IRAK4,"[['1.717', '6/27/2022']]",[],{},0,[],1.717,6/27/2022,,,1.717,5.76523
2,KT-0005653,IRAK4,C(C1=C2C(=CC=C1)N(C1C(=O)N(C)C(=O)CC1)C(=O)N2C...,879.97,IRAK4,2,KT-0005653-002,IRAK4,"[['6.240', '5/16/2022']]",[],{},0,[],6.24,5/16/2022,,,6.24,5.204815


In [4]:
# Columns of dataTable_raw used to build the two mmpdb input files.
colName_mid = 'Molecule Name'
colName_smi = 'SMILES'
colName_proj = 'proj_mcf'
colName_prop = 'hERG_pIC50_merged'


## the SMILES file for fragmentation
file_smi = f'{dir_outputs}/hERG_All_1956_2024Jun14.smi'
file_prop_csv = f'{dir_outputs}/hERG_All_1956_2024Jun14_property.csv'
delimiter = ' '

# One record per molecule: its ID and either a float property value or "*".
prop_records = []
with open(file_smi, "w") as output_file:
    # output_file.write(f'SMILES{delimiter}ID' + "\n")
    for mol_id, mol_smi, raw_prop in zip(
        dataTable_raw[colName_mid],
        dataTable_raw[colName_smi],
        dataTable_raw[colName_prop],
    ):
        ## prepare the SMILES output: one "<SMILES><delimiter><ID>" line per molecule
        output_file.write(f'{mol_smi}{delimiter}{mol_id}\n')

        ## prepare the property CSV output
        try:
            mol_prop = float(raw_prop)
        except (TypeError, ValueError) as e:
            mol_prop = "*"
            print(f'This mol {mol_id} does not have a proper property value: {e}')
        else:
            # float() accepts NaN, and pandas would then write it as an empty
            # field; use the same "*" marker as for unparsable values so the
            # property file never contains a blank column.
            if pd.isna(mol_prop):
                mol_prop = "*"
                print(f'This mol {mol_id} does not have a proper property value: {raw_prop}')
        prop_records.append({'ID': mol_id, colName_prop: mol_prop})

# Reported after the `with` block, i.e. once the file has been flushed and closed.
print(f'The SMILES strings have been saved into file: {file_smi}')

## save the csv results
# The variable name is kept from the original cell; it holds a DataFrame with
# the columns "ID" and the property name (header is written even if empty).
data_prop_dict = pd.DataFrame(prop_records, columns=['ID', colName_prop])
data_prop_dict.to_csv(file_prop_csv, index=False, sep=delimiter)

The SMILES strings have been saved into file: /mnt/data0/Research/5_Automation/mmp/rdkit/Test_1_hERG_2024Jun12/results/hERG_All_1956_2024Jun14.smi


#### 2. Fragment the SMILES

In [5]:
# Fragment every structure in the SMILES file into an mmpdb fragment database.
file_fragdb = f'{dir_outputs}/hERG_All_1956_2024Jun14.fragdb'

commandLine = ['mmpdb', 'fragment', file_smi, '-o', file_fragdb]
# check=True raises CalledProcessError on a non-zero exit status, so the
# success message below is only printed when the command really succeeded.
# Only stdout is captured; stderr is left attached to the notebook output.
process = subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
output, error = process.stdout, process.stderr  # error is None: stderr is not captured
print(f'The fragmentation is completed and saved into file {file_fragdb}')

                                   

The fragmentation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/Test_1_hERG_2024Jun12/results/hERG_All_1956_2024Jun14.fragdb


                    

#### 3. Indexing to find the matched molecular pairs in the fragment file
#### 4. Load the activity/property data

In [6]:
file_mmpdb = f'{dir_outputs}/hERG_All_1956_2024Jun14.mmpdb'

commandLine = ['mmpdb', 'index', file_fragdb, '-o', file_mmpdb, '--properties', file_prop_csv]
process = subprocess.Popen(commandLine, stdout=subprocess.PIPE)
output, error = process.communicate()
print(f'The indexing/mmp generation is completed and saved into file {file_mmpdb}')
!mmpdb list ./results/hERG_All_1956_2024Jun14.mmpdb

                                                                           

The indexing/mmp generation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/Test_1_hERG_2024Jun12/results/hERG_All_1956_2024Jun14.mmpdb
                  Name                  #cmpds #rules #pairs #envs  #stats  |----------------------------------------------------- Title -----------------------------------------------------| Properties
./results/hERG_All_1956_2024Jun14.mmpdb   1009   7235  82680  50311  50311  MMPs from '/mnt/data0/Research/5_Automation/mmp/rdkit/Test_1_hERG_2024Jun12/results/hERG_All_1956_2024Jun14.fragdb' hERG_pIC50_merged


In [None]:
# Disabled shell commands that print mmpdb's built-in help text.
# Uncomment a line to run it; as written this cell executes nothing.
# !mmpdb --help
# !mmpdb help-admin
# !mmpdb index --help

In [1]:
!mmpdb rulecat --help
!mmpdb rulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_rulecat.csv

Usage: mmpdb rulecat [OPTIONS] DATABASE

  Show the rules in an mmpdb file

Options:
  -o, --output FILENAME  Write the rules to the named file (default is stdout)
  --help                 Show this message and exit.


In [2]:
!mmpdb ruleenvcat --help
!mmpdb ruleenvcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_ruleenvcat.csv

Usage: mmpdb ruleenvcat [OPTIONS] DATABASE

  Show the rules in an mmpdb file

Options:
  --pairs / --no-pairs   With --pairs, include pairs in the output
  -o, --output FILENAME  Write the rules to the named file (default is stdout)
  --help                 Show this message and exit.


In [8]:
# Disabled: both propcat commands are commented out, so running this cell
# executes nothing. Uncomment to print the propcat help and to export the
# database properties to the CSV path given after -o.
# !mmpdb propcat --help
# !mmpdb propcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_propcat.csv

Usage: mmpdb propcat [OPTIONS] DATABASE

  Write the database properties to a properties file

  DATABASE: an mmpdb file

Options:
  --no-properties        Don't use any properties
  -p, --property NAME    Property to use (may be specified multiple times)
  --all                  Include compounds which have no properties
  -o, --output FILENAME  Output filename (default is stdout)
  --help                 Show this message and exit.

 Write information about the properties for the compounds in DATABASE,
 formatted as a property file. Use `mmpdb help-property-file` for details
 about the property file format.

 The output from this command is a tab-delimited CSV file where the first
 column has the head "ID" and contains the compound identifier. The other
 columns contain property information for each compound. The column title is
 the property name.

 By default there is one column for each property in the databases, and the
 one row for each compound with at least one property. Use '

In [7]:
!mmpdb proprulecat --help
!mmpdb proprulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_proprulecat.csv

Usage: mmpdb proprulecat [OPTIONS] DATABASE

  Write the property rules to stdout or a file

Options:
  --from SMILES                   SMILES for one side of the transformation
  --to SMILES                     SMILES for the other side of the
                                  transformation
  --canonicalize / --no-canonicalize
                                  Use the --from and --to strings as-is; do
                                  not canonicalize them (default:
                                  --canonicalize)
  -p, --property NAME             Property to use (may be specified multiple
                                  times)
  --min-count N                   Only show rules with at least N pairs
  -o, --output FILENAME           Write the output to the given file (default
                                  is stdout)
  --help                          Show this message and exit.
