In [1]:
import os
import sqlite3
import subprocess

import chardet
import pandas as pd

In [2]:
##############################################################################################
##################################### Custom Tools ###########################################
##############################################################################################
def determine_encoding(dataFile):
    """Guess the text encoding of a file.

    The file is read in binary mode in one go and the raw bytes are handed to
    ``chardet.detect``.

    Args:
        dataFile: path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(dataFile, 'rb') as handle:
        raw_bytes = handle.read()
    return chardet.detect(raw_bytes)['encoding']

def defineOutputFolder(fileName_out):
    """Resolve the output folder and create it when it is missing.

    Args:
        fileName_out: folder path, or ``None`` to use ``MMPs_results`` inside
            the current working directory.

    Returns:
        The resolved folder path.
    """
    target_dir = fileName_out
    if target_dir is None:
        target_dir = os.path.join(os.getcwd(), 'MMPs_results')

    if os.path.exists(target_dir):
        # Nothing to create; just report that the path is already there.
        print(f'\t---->{target_dir} is existing')
    else:
        os.makedirs(target_dir)
    return target_dir

##############################################################################################
########################### Load original csv for MMPs analysis ##############################
##############################################################################################
def CSV_loader(fileName_in, colName_mid, colName_smi, colNames_activity, sep=','):
    """Load the input CSV and validate the requested activity columns.

    Args:
        fileName_in: path of the CSV file to load.
        colName_mid: name of the compound-ID column.
        colName_smi: name of the SMILES column.
        colNames_activity: comma-separated string of activity/property column names.
        sep: field separator passed to ``pandas.read_csv``.

    Returns:
        A tuple ``(dataTable_raw, colName_prop_list)``: the table with rows
        lacking an ID or a SMILES removed (index reset), and the list of
        requested activity columns that are actually present in the CSV.

    Raises:
        AssertionError: if ``fileName_in`` does not exist.
    """
    print(f"1. Loading csv from {fileName_in}")
    assert os.path.exists(fileName_in), f"File {fileName_in} does not exist"

    encoding = determine_encoding(fileName_in)
    dataTable_raw = pd.read_csv(fileName_in, sep=sep, encoding=encoding)
    print(f"\tThe original csv file has {dataTable_raw.shape[0]} rows and {dataTable_raw.shape[1]} columns")
    print(f"\tColumn for compound ID is {colName_mid}")
    print(f"\tColumn for compound SMILES is {colName_smi}")

    requested_props = colNames_activity.split(',')
    print(f"\tColumns for compound activity includes {requested_props}")

    # Build a new list rather than removing from the list being iterated:
    # removing in place makes the loop skip the element after each removal,
    # so consecutive missing columns would not all be filtered out.
    colName_prop_list = []
    for prop_name in requested_props:
        if prop_name in dataTable_raw.columns:
            colName_prop_list.append(prop_name)
        else:
            print(f"\t---->Warning! {prop_name} is not in the csv file, pls check ...")

    # Keep only the molecules that have both an ID and a SMILES string.
    dataTable_raw = dataTable_raw.dropna(subset=[colName_mid, colName_smi]).reset_index(drop=True)
    print(f"\tThere are total {dataTable_raw.shape[0]} molecules in the csv with Structure(SMILES)")

    return dataTable_raw, colName_prop_list

##############################################################################################
######################## Prepare .smi and csv for MMPs analysis ##############################
##############################################################################################
def Smiles_Prep(dataTable_raw, colName_mid, colName_smi, colName_prop_list, fileName_out):
    """Write the .smi file and the property table used by the later mmpdb steps.

    Two files are written inside ``fileName_out``:

    * ``Compounds_All.smi``: one ``<SMILES> <ID>`` line per row of the table.
    * ``Property_All.csv``: space-separated table with an ``ID`` column and one
      column per entry of ``colName_prop_list``. A value that is missing or
      cannot be converted to ``float`` is written as ``*``.

    Args:
        dataTable_raw: DataFrame holding the ID, SMILES and property columns.
        colName_mid: name of the compound-ID column.
        colName_smi: name of the SMILES column.
        colName_prop_list: list of property column names to export.
        fileName_out: output folder (must already exist).

    Returns:
        A tuple ``(file_smi, file_prop_csv)`` with the two paths written.
    """
    print("2. Fragment the SMILES")
    file_smi = f'{fileName_out}/Compounds_All.smi'
    file_prop_csv = f'{fileName_out}/Property_All.csv'
    delimiter = ' '

    data_dict_prop = {}
    with open(file_smi, "w") as output_file:
        for idx in dataTable_raw.index:
            mol_id = dataTable_raw[colName_mid][idx]
            mol_smi = dataTable_raw[colName_smi][idx]

            ## SMILES output: "<SMILES> <ID>"
            output_file.write(f'{mol_smi}{delimiter}{mol_id}' + "\n")

            ## property output: one record per molecule
            prop_record = {'ID': mol_id}
            for prop_name in colName_prop_list:
                try:
                    # Look at the single cell only; building a whole-column
                    # notna() mask for every cell made this loop quadratic.
                    raw_value = dataTable_raw.at[idx, prop_name]
                    mol_prop = float(raw_value) if pd.notna(raw_value) else "*"
                except Exception as e:
                    mol_prop = "*"
                    print(f'\t---->Warning! This mol {mol_id} does not have a proper property value: {e}')
                prop_record[prop_name] = mol_prop
            data_dict_prop[idx] = prop_record
        print(f'\tThe SMILES strings have been saved into .smi file: {file_smi}')

    ## save the property table
    data_table_prop = pd.DataFrame.from_dict(data_dict_prop).T
    data_table_prop.to_csv(file_prop_csv, index=False, sep=delimiter)
    # Report the property file path (the message used to repeat the .smi path).
    print(f'\tThe property data have been saved into .csv file: {file_prop_csv}')
    return file_smi, file_prop_csv

##############################################################################################
##################################### Fragment the SMILES ####################################
##############################################################################################
def Smiles_fragmentation(fileName_out, file_smi):
    """Run ``mmpdb fragment`` on a .smi file.

    Args:
        fileName_out: output folder; the result is written to
            ``<fileName_out>/Compounds_All.fragdb``.
        file_smi: path of the input .smi file.

    Returns:
        Path of the fragment database file.

    Raises:
        subprocess.CalledProcessError: if mmpdb exits with a non-zero status.
    """
    file_fragdb = f'{fileName_out}/Compounds_All.fragdb'
    commandLine = ['mmpdb', 'fragment', file_smi, '-o', file_fragdb]
    # stdout is captured and discarded as before; stderr is left attached to the
    # notebook so whatever mmpdb writes there stays visible. check=True stops
    # the "completed" message from being printed after a failed run.
    subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
    print(f'\tThe fragmentation is completed and saved into file {file_fragdb}')
    return file_fragdb

##############################################################################################
################## Indexing to find the MMPs and load the activity data ######################
##############################################################################################
def Index_LinkActivity(fileName_out, file_fragdb, file_prop_csv):
    """Run ``mmpdb index`` on a fragment file, passing the property table.

    Args:
        fileName_out: output folder; the result is written to
            ``<fileName_out>/Compounds_All.mmpdb``.
        file_fragdb: path of the fragment database to index.
        file_prop_csv: path of the property table given to ``--properties``.

    Returns:
        Path of the generated mmpdb file.

    Raises:
        subprocess.CalledProcessError: if mmpdb exits with a non-zero status.
    """
    print("3. Indexing to find the matched molecular pairs in the fragment file")
    print("4. Now load the activity/property data")
    file_mmpdb = f'{fileName_out}/Compounds_All.mmpdb'
    commandLine = ['mmpdb', 'index', file_fragdb, '-o', file_mmpdb, '--properties', file_prop_csv]
    # stdout is captured and discarded as before; stderr stays attached to the
    # notebook. check=True stops the "completed" message from being printed
    # after a failed run.
    subprocess.run(commandLine, stdout=subprocess.PIPE, check=True)
    print(f'\tThe indexing/mmp generation is completed and saved into file {file_mmpdb}')
    return file_mmpdb

##############################################################################################
############################### Loading data from database ###################################
##############################################################################################
def call_my_query(db_file, my_query):
    """Run one SQL statement against a SQLite file and return all rows.

    Args:
        db_file: path of the SQLite database file.
        my_query: SQL text to execute.

    Returns:
        List of row tuples as returned by ``cursor.fetchall()``.

    Raises:
        sqlite3.Error: whatever sqlite3 raises for the statement; the
            connection is closed before the error propagates.
    """
    ## connect to the SQLite database
    my_connection = sqlite3.connect(db_file)
    try:
        ## execute the query and fetch all the rows
        my_cursor = my_connection.cursor()
        my_cursor.execute(my_query)
        return my_cursor.fetchall()
    finally:
        ## always release the connection, even when the query fails
        my_connection.close()

def extract_tables(db_file, table_name):
    """Read a whole table from a SQLite file into a nested dict.

    Args:
        db_file: path of the SQLite database file.
        table_name: name of the table; it is inserted into the SQL text as-is.

    Returns:
        Dict keyed by the first field of each row; each value maps column
        name to that row's value.
    """
    ## column metadata first, then the rows themselves
    column_info = call_my_query(db_file, f"PRAGMA table_info({table_name})")
    table_rows = call_my_query(db_file, f"SELECT * FROM {table_name}")

    ## each metadata tuple carries the column position at [0] and its name at [1]
    return {
        record[0]: {info[1]: record[info[0]] for info in column_info}
        for record in table_rows
    }
    
##############################################################################################
############################### Loading data from database ###################################
##############################################################################################

In [3]:
##############################################################################################
############################## Arguments for MMPs analysis ###################################
##############################################################################################

## input CSV and its field separator
fileName_in = './Data_ADMET_4_MMP_2024Aug27.csv'
sep = ','

## output folder: passing None selects <cwd>/MMPs_results
fileName_out = defineOutputFolder(None)

## column names: compound ID, SMILES, comma-separated activity columns
colName_mid, colName_smi = 'Compound Name', 'Smiles'
colNames_activity = 'permeability,fakeCol'

	---->/mnt/data0/Research/5_Automation/mmp/rdkit/Application_MMPsAnalysis/MMPs_results is existing


In [4]:
## 0. Load the CSV, using the separator set in the configuration cell
dataTable_raw, colName_prop_list = CSV_loader(fileName_in, colName_mid, colName_smi, colNames_activity, sep=sep)
## 1. Prepare the SMILES file and property CSV file
file_smi, file_prop_csv = Smiles_Prep(dataTable_raw, colName_mid, colName_smi, colName_prop_list, fileName_out)
## 2. Fragment the SMILES
file_fragdb = Smiles_fragmentation(fileName_out, file_smi)
## 3. Indexing to find the MMPs in the fragment file & Load the activity/property data
file_mmpdb = Index_LinkActivity(fileName_out, file_fragdb, file_prop_csv)

1. Loading csv from ./Data_ADMET_4_MMP_2024Aug27.csv
	The original csv file has 25714 rows and 49 columns
	Column for compound ID is Compound Name
	Column for compound SMILES is Smiles
	Columns for compound activity includes ['permeability', 'fakeCol']
	There are total 25714 molecules in the csv with Structure(SMILES)
2. Fragment the SMILES
	The SMILES strings have been saved into .smi file: /mnt/data0/Research/5_Automation/mmp/rdkit/Application_MMPsAnalysis/MMPs_results/Compounds_All.smi
	The property data have been saved into .csv file: /mnt/data0/Research/5_Automation/mmp/rdkit/Application_MMPsAnalysis/MMPs_results/Compounds_All.smi


Preparing record 23916[18:09:31] Can't kekulize mol.  Unkekulized atoms: 6
                                     

	The fragmentation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/Application_MMPsAnalysis/MMPs_results/Compounds_All.fragdb
3. Indexing to find the matched molecular pairs in the fragment file
4. Now load the activity/property data


                                                                         

	The indexing/mmp generation is completed and saved into file /mnt/data0/Research/5_Automation/mmp/rdkit/Application_MMPsAnalysis/MMPs_results/Compounds_All.mmpdb


In [None]:
!mmpdb list ./results/Compounds_All.mmpdb

In [None]:
# !mmpdb --help
# !mmpdb help-admin
# !mmpdb index --help

# !mmpdb rulecat --help
# !mmpdb rulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_rulecat.csv

# !mmpdb ruleenvcat --help
# !mmpdb ruleenvcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_ruleenvcat.csv

# !mmpdb propcat --help
# !mmpdb propcat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_propcat.csv

# !mmpdb proprulecat --help
# !mmpdb proprulecat ./results/hERG_All_1956_2024Jun14.mmpdb -o ./results/catfolder/hERG_All_1956_2024Jun14_proprulecat.csv