In [None]:
import os
import subprocess
import pandas as pd

In [None]:
# Folder (inside the current working directory) that receives every file this notebook writes.
dir_outputs = os.path.join(os.getcwd(), 'results')

# Create the folder on first use; otherwise just report that it is already there.
if os.path.exists(dir_outputs):
    print(f'{dir_outputs} is existing')
else:
    os.makedirs(dir_outputs)

#### 1. Prepare the SMILES file and property CSV file

In [None]:
# Names of the columns this notebook reads from the input table.
colName_mid = 'Compound Name'
colName_smi = 'Structure'    # 'Smiles'
colName_proj = 'Concat;Project'

# Load the table, drop rows lacking an ID or a structure, renumber the rows,
# and shorten the molecular-weight header to 'MW'.
dataTable_raw = (
    pd.read_csv('./Data_4_MMP_2024Dec10.csv', low_memory=False)
    .dropna(subset=[colName_mid, colName_smi])
    .reset_index(drop=True)
    .rename(columns={'Molecular Weight': 'MW'})
)

print(f'There are total {len(dataTable_raw)} molecules in the table with Structure(SMILES)')
dataTable_raw.head(3)

In [None]:
colName_prop_list = ['MW', 'F%_Rat', 'EstFa_Rat', 'permeability', 'efflux', 'hERG_IC50', 'hERG_mixedIC50']

## the SMILES file for fragmentation
file_smi = f'{dir_outputs}/Compounds_All.smi'
## the property table written alongside it
file_prop_csv = f'{dir_outputs}/Property_All.csv'
delimiter = ' '
missing_marker = "*"    # placeholder written when a property value is absent or not numeric


def _to_float_or_missing(value, missing=missing_marker):
    """Convert one table cell to ``float``, falling back to ``missing``.

    Parameters
    ----------
    value : scalar
        A single cell taken from a property column.
    missing : str
        Placeholder returned when ``value`` is NaN/None or cannot be parsed as a number.

    Returns
    -------
    float or str
        The numeric value, or ``missing``.
    """
    if pd.isna(value):
        return missing
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # e.g. qualified values such as '>30' -- best effort: treat as missing
        return missing
    # float('nan') parses fine but would be written as an empty field; treat it as missing too
    return missing if pd.isna(number) else number


## prepare the SMILES output: one "<SMILES><delimiter><ID>" line per molecule, no header
# NOTE(review): a SMILES or ID that itself contains whitespace would make these
# space-delimited lines ambiguous -- confirm the input never has such values.
with open(file_smi, "w") as output_file:
    # output_file.write(f'SMILES{delimiter}ID' + "\n")
    for mol_smi, mol_id in zip(dataTable_raw[colName_smi], dataTable_raw[colName_mid]):
        output_file.write(f'{mol_smi}{delimiter}{mol_id}' + "\n")
print(f'The SMILES strings have been saved into file: {file_smi}')

## prepare the property CSV output: an 'ID' column followed by one column per property
prop_columns = {'ID': dataTable_raw[colName_mid]}
for prop_name in colName_prop_list:
    if prop_name in dataTable_raw.columns:
        # one pass per column instead of re-evaluating a whole-column mask for every row
        prop_columns[prop_name] = dataTable_raw[prop_name].map(_to_float_or_missing)
    else:
        # keep the column (all missing) but say so instead of failing silently
        print(f'Warning: property column {prop_name!r} is not in the table; writing {missing_marker!r} for every molecule')
        prop_columns[prop_name] = missing_marker

# object dtype keeps floats and the placeholder side by side in the same column
data_table_prop = pd.DataFrame(prop_columns, index=dataTable_raw.index).astype(object)
# row-wise dict view {row index: {'ID': ..., <property>: ...}}, kept for code that still expects it
data_dict_prop = data_table_prop.to_dict(orient='index')

## save the csv results
data_table_prop.to_csv(file_prop_csv, index=False, sep=delimiter)
print(data_table_prop.shape)
data_table_prop.head(3)

#### 2. Fragment the SMILES

In [None]:
file_fragdb = f'{dir_outputs}/Compounds_All.fragdb'

# Run `mmpdb fragment` on the SMILES file. Both output streams are captured so that a
# failure can be reported here instead of being followed by a misleading success message.
commandLine = ['mmpdb', 'fragment', file_smi, '-o', file_fragdb]
process = subprocess.run(commandLine, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.stdout, process.stderr    # raw bytes of stdout / stderr

if process.returncode != 0:
    raise RuntimeError(
        f'mmpdb fragment failed with exit code {process.returncode}:\n'
        + error.decode(errors='replace')
    )
print(f'The fragmentation is completed and saved into file {file_fragdb}')

#### 3. Index the fragments to find the matched molecular pairs
#### 4. Load the activity/property data (both steps run in a single `mmpdb index ... --properties` call)

In [None]:
file_mmpdb = f'{dir_outputs}/Compounds_All.mmpdb'

# Run `mmpdb index` on the fragment file and pass the property table in the same call.
# Both output streams are captured so that a failure can be reported here instead of
# being followed by a misleading success message.
commandLine = ['mmpdb', 'index', file_fragdb, '-o', file_mmpdb, '--properties', file_prop_csv]
process = subprocess.run(commandLine, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.stdout, process.stderr    # raw bytes of stdout / stderr

if process.returncode != 0:
    raise RuntimeError(
        f'mmpdb index failed with exit code {process.returncode}:\n'
        + error.decode(errors='replace')
    )
print(f'The indexing/mmp generation is completed and saved into file {file_mmpdb}')

In [None]:
!mmpdb list ./results/Compounds_All.mmpdb

In [None]:
# !mmpdb --help
# !mmpdb help-admin
# !mmpdb index --help

In [None]:
# !mmpdb rulecat --help
# !mmpdb rulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_rulecat.csv

# !mmpdb ruleenvcat --help
# !mmpdb ruleenvcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_ruleenvcat.csv

# !mmpdb propcat --help
# !mmpdb propcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_propcat.csv

# !mmpdb proprulecat --help
# !mmpdb proprulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_proprulecat.csv

In [None]:
# SMILES of the example query molecule (the same structure is passed to `mmpdb generate` below).
test_smi = (
    "CCN(C)C(=O)C1=CC2=C(N1)C(F)=CN=C2"
    "C1=C(Cl)C=C(N2CCC(CN3CCN(C4=CC=CC5=C4N(C)C(=O)N5C4CCC(=O)NC4=O)CC3)CC2)C=C1"
)

In [None]:
!mmpdb generate --smiles 'CCN(C)C(=O)C1=CC2=C(N1)C(F)=CN=C2C1=C(Cl)C=C(N2CCC(CN3CCN(C4=CC=CC5=C4N(C)C(=O)N5C4CCC(=O)NC4=O)CC3)CC2)C=C1' ./results/Compounds_All.mmpdb