In [1]:
import os
import copy
import numpy as np
import pandas as pd

In [2]:
## load the modules needed for this section
import ast
import subprocess
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

##########################################################################################################################################
##########################################################################################################################################
##########################################################################################################################################
class ChemAxonAPI(object):
    """Client for a ChemAxon calculator REST endpoint, called through the ``curl`` executable.

    A request is one JSON document holding a fixed set of calculations (see
    ``_generate_dataJson``) plus a SMILES string. The JSON response is flattened
    into a single ``{column_name: value}`` dictionary by ``_parse_output``.

    Attributes:
        output: raw stdout bytes of the most recent ``curl`` call (``None`` before any call).
        error: stderr of the most recent call; stays ``None`` because stderr is not piped.
    """

    ## <----- model initiation ---->
    def __init__(self, ip='172.31.19.252', port='8064', calculator='calculate'):
        """Store the endpoint URL and the HTTP headers sent with every request.

        Args:
            ip: host of the calculator service.
            port: port of the calculator service.
            calculator: last path segment of the REST route.
        """
        self._api_url = f'http://{ip}:{port}/rest-v1/calculator/{calculator}'
        self._headers = ['accept: */*', 'Content-Type: application/json']
        ## raw result of the latest request; filled by <calculation_from_smi>
        self.output, self.error = None, None

    ## <----- run api calls and parse the results ---->
    def calculation_from_smi(self, smi, detailedInfo=False):
        """POST one SMILES to the service and return the flattened results.

        Args:
            smi: SMILES string of the molecule.
            detailedInfo: when True, per-atom result entries (names containing
                ``ByAtom``) are kept as well.

        Returns:
            dict mapping ``'cx_' + property name`` to its value; empty when the
            response could not be decoded.
        """
        ## 1. escape the SMILES and build the request body
        dataJson = self._generate_dataJson(self._cleanup_smi(smi))

        ## 2. assemble the curl command (argument list, so no shell quoting is involved)
        commandLine = ['curl', '-X', 'POST', self._api_url]
        for header in self._headers:
            commandLine.extend(['-H', header])
        commandLine.extend(['-d', dataJson])

        ## 3. run the command and keep the raw response on the instance
        process = subprocess.Popen(commandLine, stdout=subprocess.PIPE)
        self.output, self.error = process.communicate()

        ## 4. flatten the response and prefix every column with 'cx_'
        dataDict_out = self._parse_output(detailedInfo=detailedInfo)
        return {'cx_' + prop: value for prop, value in dataDict_out.items()}

    def calculation_from_mol(self, mol, detailedInfo=False):
        """Same as ``calculation_from_smi`` but starting from an RDKit mol object.

        Returns an empty dict when RDKit cannot write a SMILES for ``mol``.
        """
        try:
            smi = Chem.MolToSmiles(mol)
        except Exception as e:
            print(f'\tThis mol cannot be generated SMILES using RDKit; Error msg: {e}')
            dataDict_results = {}
        else:
            dataDict_results = self.calculation_from_smi(smi, detailedInfo=detailedInfo)
        return dataDict_results

    ####################### tool function for api calls preparation ########################
    ## <----- api Json preparation ---->
    def _generate_dataJson(self, smi, propList=None):
        """Build the JSON request body (as a string) for one molecule.

        Args:
            smi: SMILES that is already safe to place inside a JSON string
                literal (see ``_cleanup_smi``); it is inserted verbatim.
            propList: accepted for interface compatibility; not used, the set
                of calculations is fixed below.

        Returns:
            JSON text with the keys ``calculations``, ``inputFormat`` and ``structure``.
        """
        ## predefine the calculators and their parameters (each value is a JSON fragment)
        calculations = {
            # 'elemental-analysis': '{"countAtoms": [1, 6, 8],  "countIsotopes": [{"atomNumber": 6, "isotopeNumber": 12}], "operations": "mass, formula", "symbolID": true}',
            'elemental-analysis': '{"countAtoms": [1, 6, 8],  "countIsotopes": [{"atomNumber": 6, "isotopeNumber": 12}], "operations": "mass", "symbolID": true}',
            'polar-surface-area': '{"excludePhosphorus": true, "excludeSulfur": true, "outputFormat": "mrv", "outputStructureIncluded": false, "pH": 7.4}',
            'hbda': '{"excludeHalogens": true, "excludeSulfur": true, "outputFormat": "mrv", "outputStructureIncluded": false, "pH": 7.4}',
            'logd': '{"phList": [1.5, 5, 6.5, 7.4]}',
            'logp': '{"atomIncrements": true, "method": "CHEMAXON"}',
            'topology-analyser': '{"aliphaticRingSize": 0, "aromaticRingSize": 0, "aromatizationMethod": "GENERAL", "carboRingSize": 0, "fusedAliphaticRingSize": 0, "fusedAromaticRingSize": 0, "heteroAliphaticRingSize": 0, "ringSize": 0, "heteroAromaticRingSize": 0, "heteroRingSize": 0, "operations": "myOperationText", "outputFormat": "mrv", "ringSystemSize": 0}',
            'charge': '{"ph": 7.4}',
            'pka': '{"micro": false, "outputFormat": "mrv", "outputStructureIncluded": false, "prefix": "STATIC", "pKaLowerLimit": -10, "pKaUpperLimit": 20, "temperature": 298, "types": "pKa, acidic, basic"}',
            'cns-mpo': '{}',
            # 'hlb': '{}', 'bbb': '{}', 'cns-mpo': '{}',
            # 'pka-distribution': '{"considerTautomerization": true, "pKaLowerLimit": -20, "pKaUpperLimit": 10, "temperature": 298, "phSequence": {"pHLower": 1.5, "pHStep": 0.1, "pHUpper": 7.4}, "resultMoleculeFormat": "MRV"}',
            # 'solubility': '{"phSequence": {"pHLower": 1.5, "pHStep": 0.1, "pHUpper": 7.4}, "unit": "MM"}'
        }
        ## the topology operation list is long, so it is substituted into its placeholder here
        myOperationTopology = 'fsp3, chainBondCount, rotatableBondCount, aromaticAtomCount, chiralCenterCount, aromaticRingCount, heteroRingCount, fusedAliphaticRingCount, aliphaticRingCount, fusedAromaticRingCount, heteroAromaticRingCount, fusedRingCount, largestRingSystemSize, largestRingSize, ringSystemCount'
        calculations['topology-analyser'] = calculations['topology-analyser'].replace('myOperationText', myOperationTopology)

        ## join the calculators into the body of the "calculations" object
        calculators_text = ', '.join(f'"{prop}": {prop_param}' for prop, prop_param in calculations.items())

        ## prepare the dataJson string for API calls
        dataJson = '{"calculations": {%s}, "inputFormat": "smiles", "structure": "%s"}' % (calculators_text, smi)
        return dataJson

    def _cleanup_smi(self, smi):
        """Double every backslash so the SMILES can sit inside the JSON string built by ``_generate_dataJson``."""
        if "\\" in smi:
            print(f'\tThere is a "\\" in the SMILES {smi}')
            smi = smi.replace('\\', '\\\\')
            print(f'\tAdd 1 "\\" into the SMILES, now new SMILES is {smi}')
        return smi

    def _parse_output(self, detailedInfo=False):
        """Flatten the response held in ``self.output`` into ``{column_name: value}``.

        Args:
            detailedInfo: keep per-atom entries (names containing ``ByAtom``) when True.

        Returns:
            dict of flattened results; empty when the response cannot be decoded
            or is not a JSON object.
        """
        import json

        dataDict_out = {}
        try:
            output_text = self.output.decode()
            try:
                ## the service answers in JSON; literal_eval cannot read true/false/null
                output_decoded = json.loads(output_text)
            except ValueError:
                ## keep the previous parser as a fallback for Python-literal style text
                output_decoded = ast.literal_eval(output_text)
        except Exception as e:
            print(f'\tCannot decode the output. Error msg: {e}')
            return dataDict_out

        if not isinstance(output_decoded, dict):
            print(f'\tCannot decode the output. Error msg: expected a JSON object, got {type(output_decoded).__name__}')
            return dataDict_out

        ## result groups that are never exported
        propTypes_skipped = ("isoelectric-point", "pka-distribution", "major-microspecies")

        ## loop all the results in the output and extract them one by one
        for propType, propResult in output_decoded.items():
            if propType in propTypes_skipped:
                continue

            ## a top-level value that is not an object has no sub-properties to flatten
            if not isinstance(propResult, dict):
                dataDict_out[propType] = propResult

            ## the calculation reported an error: export the message instead of values
            elif "error" in propResult:
                errorInfo = propResult["error"]
                dataDict_out[propType] = errorInfo.get("message", errorInfo) if isinstance(errorInfo, dict) else errorInfo

            elif propType == 'logd':
                for item in propResult['logDByPh']:
                    pH = item['pH']
                    dataDict_out[f'{propType}[pH={pH}]'] = item['value']

            elif propType == 'solubility':
                for item in propResult['phDependentSolubilities']:
                    ## NOTE(review): assumes each item carries its own 'pH' like the logD items do - confirm
                    pH = item.get('pH')
                    dataDict_out[f'{propType}[pH={pH}]'] = item['value']

            elif propType == 'pka':
                ## export the two lowest acidic and the two lowest basic pKa values
                for pKa_type in ['acidic', 'basic']:
                    pKa_spec = sorted(propResult[f'{pKa_type}ValuesByAtom'], key=lambda x: x['value'])
                    for i_pka, pKa_item in enumerate(pKa_spec[:2]):
                        dataDict_out[f'pka_{pKa_type[0]}pKa{i_pka + 1}'] = pKa_item['value']

            else:
                for propName, propValue in propResult.items():
                    ## for the atom level detailed information, bypass if <detailedInfo> flag false
                    if not detailedInfo and 'ByAtom' in propName:
                        continue
                    ## cns/bbb score with components
                    if propType in ['cns-mpo', 'bbb'] and propName == 'properties':
                        for item in propValue:
                            component_name = item['name']
                            component_score = item['score']
                            component_value = item['value']
                            dataDict_out[f'{propType}_component_{component_name}'] = f'{component_score} ({component_value})'
                    else:
                        dataDict_out[f'{propType}_{propName}'] = propValue
        return dataDict_out

##########################################################################################################################################
##########################################################################################################################################
##########################################################################################################################################
class morgan_Fps_calculator(object):
    """Morgan (circular) bit-vector fingerprint calculator built on RDKit."""

    ## <----- model initiation ---->
    def __init__(self, radius=3, nBits=1024):
        """Store the fingerprint settings.

        Args:
            radius: Morgan radius, coerced to int.
            nBits: length of the bit vector, coerced to int.
        """
        self._radius = int(radius)
        self._nBits = int(nBits)

    def calculation_from_mol(self, mol):
        """Compute the fingerprint of an RDKit mol.

        Returns:
            dict with the bit-vector object under ``'Fps'`` and one
            ``'FP_bit_<i>'`` entry per bit, or ``None`` when the fingerprint
            cannot be computed (including ``mol is None``).
        """
        ## a missing mol can never be fingerprinted; report it without calling RDKit
        if mol is None:
            print(f'\tThis mol cannot be calculated into FPs using RDKit; Error msg: mol is None')
            return None

        try:
            Fps = AllChem.GetMorganFingerprintAsBitVect(mol, radius=self._radius, nBits=self._nBits)
        except Exception as e:
            print(f'\tThis mol cannot be calculated into FPs using RDKit; Error msg: {e}')
            return None

        dataDict_results = {'Fps': Fps}
        for i in range(len(Fps)):
            dataDict_results[f'FP_bit_{i}'] = Fps[i]
        return dataDict_results

    def calculation_from_smi(self, smi):
        """Parse a SMILES with RDKit and compute its fingerprint.

        Returns:
            Same dict as ``calculation_from_mol``, or ``None`` when the SMILES
            cannot be turned into a mol.
        """
        try:
            mol = Chem.MolFromSmiles(self._cleanup_smi(smi))
        except Exception as e:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: {e}')
            return None

        ## RDKit signals an unparsable SMILES by returning None rather than raising,
        ## so that case has to be reported here explicitly
        if mol is None:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: MolFromSmiles returned None')
            return None
        return self.calculation_from_mol(mol)

    def _cleanup_smi(self, smi):
        """Double every backslash in the SMILES and report the change.

        NOTE(review): the doubled form is what gets handed to RDKit here - confirm
        that this is intended and that RDKit accepts it.
        """
        if "\\" in smi:
            print(f'\tThere is a "\\" in the SMILES {smi}')
            smi = smi.replace('\\', '\\\\')
            print(f'\tAdd 1 "\\" into the SMILES, now new SMILES is {smi}')
        return smi

##########################################################################################################################################
##########################################################################################################################################
##########################################################################################################################################
class RDKit_desc_calculator(object):
    """Calculator for a selectable subset of the RDKit descriptors in ``Descriptors._descList``."""

    ## <----- model initiation ---->
    def __init__(self, physChem=True, subStr=True, clean=False):
        """Select the descriptor names and build the RDKit calculator for them.

        Args:
            physChem: include the descriptors whose name does not start with ``fr_``.
            subStr: include the ``fr_*`` substructure descriptors.
            clean: drop the descriptor families listed in ``__define_desc_list``.
        """
        self._desc_list = self.__define_desc_list(physChem=physChem, subStr=subStr, clean=clean)
        self._desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(self._desc_list)
        # print(f"\tInitiate a RDKit desc calcualtor for {len(self._desc_list)} desc.")

    def calculation_from_mol(self, mol):
        """Compute the selected descriptors for an RDKit mol.

        Returns:
            dict ``{descriptor_name: value}``, or ``None`` when RDKit raises.
        """
        try:
            rdkit_desc = self._desc_calc.CalcDescriptors(mol)
        except Exception as e:
            print(f'\tThis mol cannot be calculated property using RDKit; Error msg: {e}')
            return None

        assert len(self._desc_list) == len(rdkit_desc), f"\tError! Num_calc_desc does not match desc_list"
        return dict(zip(self._desc_list, rdkit_desc))

    def calculation_from_smi(self, smi):
        """Parse a SMILES with RDKit and compute the selected descriptors.

        Returns:
            Same as ``calculation_from_mol``, or ``None`` when parsing raises.
        """
        try:
            mol = Chem.MolFromSmiles(self._cleanup_smi(smi))
        except Exception as e:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: {e}')
            return None
        ## NOTE(review): Chem.MolFromSmiles is documented to return None (not raise) for an
        ## unparsable SMILES, so <mol> may be None here; what CalcDescriptors yields for a
        ## None mol should be confirmed before these values are used as features
        return self.calculation_from_mol(mol)

    def __define_desc_list(self, physChem=True, subStr=True, clean=False):
        """Return the descriptor names selected by the three flags.

        Raises:
            AssertionError: when both ``physChem`` and ``subStr`` are False.
        """
        ## error checking
        assert physChem or subStr, f"\tError! One of <physChem> or <subStr> should be True."

        # all descriptor names known to RDKit
        all_list = [n[0] for n in Descriptors._descList]

        ## define descriptor list
        if physChem and subStr:
            # using all descriptors
            desc_list = all_list
        elif physChem:
            # only the physicochemical properties (everything that is not 'fr_*')
            desc_list = [i for i in all_list if not i.startswith('fr_')]
        else:
            # only the substructure features <Fraction of a substructure (e.g., 'fr_Al_COO')>
            desc_list = [i for i in all_list if i.startswith('fr_')]

        if clean:
            ## str.startswith accepts a tuple, so all families are filtered in one pass
            rm_prefixes = ('BCUT2D_', 'Chi', 'EState_', 'VSA_', 'SlogP_', 'SMR_', 'PEOE_')
            desc_list = [i for i in desc_list if not i.startswith(rm_prefixes)]
        return desc_list

    def _cleanup_smi(self, smi):
        """Double every backslash in the SMILES and report the change.

        NOTE(review): the doubled form is what gets handed to RDKit here - confirm
        that this is intended and that RDKit accepts it.
        """
        if "\\" in smi:
            print(f'\tThere is a "\\" in the SMILES {smi}')
            smi = smi.replace('\\', '\\\\')
            print(f'\tAdd 1 "\\" into the SMILES, now new SMILES is {smi}')
        return smi

In [3]:
## load the modules needed for this section
import chardet
from rdkit import Chem

##
class AssayData4ML(object):
    """Container that turns an assay CSV table into per-molecule descriptor data.

    Typical flow: ``load_csv`` -> ``calc_desc`` -> ``train_val_test_split`` ->
    ``prep_dataset``.

    Attributes set along the way:
        _dataTableRaw: the table read from the CSV file.
        _molDict: ``{compound_id: {'idx_list', 'Smiles', 'Smiles_clean', 'desc_*'}}``.
        _colNameId / _colNameSmi: names of the id and SMILES columns in use.
        _splitDict: ``{row: {..., 'cv_id': fold}}`` written by ``train_val_test_split``.
    """

    def __init__(self, dataName="myData"):
        self._name = dataName
        self._dataTableRaw = None
        self._molDict = None

    ## load data from CSV
    def load_csv(self, fileNameIn, sep=",", usecols=None, colName_id="Compound Name", colName_smi="Smiles"):
        """Read the CSV file and extract one record per compound id.

        Args:
            fileNameIn: path of the CSV file; must exist.
            sep: column separator passed to ``pd.read_csv``.
            usecols: column subset passed to ``pd.read_csv``.
            colName_id: column holding the compound id.
            colName_smi: column holding the SMILES.

        Raises:
            AssertionError: when the file does not exist, or the SMILES column is missing.

        A failure while reading the file is printed and leaves the object unchanged.
        """
        assert os.path.exists(fileNameIn), f"File {fileNameIn} does not exist"
        self.__setAttributes("_fileNameIn", fileNameIn)
        try:
            ## determine the encoding
            encoding = self.__determine_input_file_encoding(default='utf-8')
            ## read csv file
            dataTable = pd.read_csv(fileNameIn, sep=sep, usecols=usecols, encoding=encoding)
            print(f"Loading data with {dataTable.shape[0]} rows and {dataTable.shape[1]} cols from file {fileNameIn}")
        except Exception as e:
            print(f"Can not read cvs file, error: {e}")
        else:
            self.__setAttributes("_dataTableRaw", dataTable)
            molDict = self.__extract_mol_data(colName_id=colName_id, colName_smi=colName_smi)
            self.__setAttributes("_molDict", molDict)

    ## calculate descriptors from smiles
    def calc_desc(self, desc_fps=True, desc_rdkit=True, desc_cx=True):
        """Compute the requested descriptor families for every molecule in ``_molDict``.

        Args:
            desc_fps: compute ECFP fingerprints (radius 3, 2048 bits).
            desc_rdkit: compute the RDKit descriptors (cleaned list).
            desc_cx: compute the ChemAxon properties through the REST service.
        """
        assert self._molDict is not None, f"\tError, self.__molDict is None, pls check the data loading from csv."
        ## calculate mol fingerprints
        if desc_fps:
            fpType, radius, nBits = "ECFP", 3, 2048
            molDict = self.__calc_desc_fingerprints(fpType=fpType, radius=radius, nBits=nBits)
            self.__setAttributes("_molDict", molDict)
        ## calculate rdkit properties
        if desc_rdkit:
            physChem, subStr, clean = True, True, True
            molDict = self.__calc_desc_rdkit(physChem=physChem, subStr=subStr, clean=clean)
            self.__setAttributes("_molDict", molDict)
        ## calculate chemAxon properties
        if desc_cx:
            ip, port, calculator = '172.31.19.252', '8064', 'calculate'
            molDict = self.__calc_desc_chemaxon(ip=ip, port=port, calculator=calculator)
            self.__setAttributes("_molDict", molDict)

    ## prepare the dataset: raw columns (+ cv_id when available) + descriptor columns
    def prep_dataset(self, desc_fps=True, desc_rdkit=True, desc_cx=True):
        """Return a copy of the raw table with the selected descriptor columns joined on.

        Args:
            desc_fps: add the ``FP_bit_*`` columns (the raw bit-vector object is left out).
            desc_rdkit: add the RDKit descriptor columns.
            desc_cx: add the ChemAxon (``cx_*``) columns.

        Returns:
            A new DataFrame; ``_dataTableRaw`` itself is not modified. Descriptor
            families that were never calculated simply contribute no columns.
        """
        assert self._dataTableRaw is not None, f"\tError, self._dataTableRaw is None, pls check the data loading from csv."
        assert self._molDict is not None, f"\tError, self._molDict is None, pls check the data loading from csv."
        dataTable = self._dataTableRaw.copy()

        ## attach the fold id when a split has been computed
        ## NOTE(review): the earlier draft derived a 'DataSet' label from 'cv_id' with an
        ## unspecified mapping; only the fold id is attached here until that mapping is defined
        splitDict = getattr(self, "_splitDict", None)
        if splitDict:
            dataTable['cv_id'] = pd.Series({idx: row.get('cv_id') for idx, row in splitDict.items()})

        ## collect the requested descriptor values for every row of every molecule
        descKeys = [key for key, wanted in (('desc_fps', desc_fps), ('desc_rdkit', desc_rdkit), ('desc_cx', desc_cx)) if wanted]
        descRows = {}
        for cid, molInfo in self._molDict.items():
            rowDesc = {}
            for key in descKeys:
                for descName, descValue in molInfo.get(key, {}).items():
                    ## 'Fps' holds the fingerprint object itself, which is not a table value
                    if descName != 'Fps':
                        rowDesc[descName] = descValue
            for idx in molInfo['idx_list']:
                descRows[idx] = rowDesc

        if descRows:
            descTable = pd.DataFrame.from_dict(descRows, orient='index')
            ## rsuffix keeps the join from failing if a descriptor shares a name with a raw column
            dataTable = dataTable.join(descTable, rsuffix='_desc')
        return dataTable

    ## separate training/test/validation set
    def train_val_test_split(self, method="temporal", colName_date="Created On", CV=5, rng=666666):
        """Assign every row of the raw table to one of ``CV`` folds and store the result.

        Args:
            method: ``"temporal"`` (folds follow the date order) or ``"random"``.
            colName_date: date column used by the temporal split.
            CV: number of folds.
            rng: random seed used by the random split.

        Returns:
            dict ``{row: {<id column>: ..., 'cv_id': fold, ...}}``; the same object
            is stored in ``self._splitDict``.

        Raises:
            ValueError: for an unknown ``method``.
        """
        assert self._dataTableRaw is not None, f"\tError, self._dataTableRaw is None, pls check the data loading from csv."
        if method == "temporal":
            splitDict = self.__nFoldSplit_temporal(colName_date=colName_date, CV=CV)
        elif method == "random":
            splitDict = self.__nFoldSplit_random(CV=CV, rng=rng)
        else:
            raise ValueError(f"Unknown split method <{method}>, use 'temporal' or 'random'")
        self.__setAttributes("_splitDict", splitDict)
        return splitDict

    ## ================================================
    ## ============ tools for loading data ============
    ## ================================================
    def __determine_input_file_encoding(self, default='utf-8'):
        """Detect the encoding of ``self._fileNameIn`` with chardet; fall back to ``default``."""
        fileNameIn = self._fileNameIn
        try:
            # Step 1: Open the file in binary mode
            with open(fileNameIn, 'rb') as f:
                data = f.read()

            # Step 2: Detect the encoding using the chardet library
            encoding_result = chardet.detect(data)

            # Step 3: Retrieve the encoding information (may be missing when nothing is detected)
            encoding = encoding_result['encoding'] or default
        except Exception as e:
            print(f"Can not detect encoding, error {e}")
            encoding = default
        else:
            if encoding != default:
                print(f"Using Encoding <{encoding}>")
        return encoding

    ##
    def __extract_mol_data(self, colName_id="Compound Name", colName_smi="Smiles"):
        """Group the rows of the raw table by compound id and collect their SMILES.

        When the id column is missing, a ``rowIndex_tmp`` column is added to the
        raw table and used as id. Records ``_colNameId`` and ``_colNameSmi``.

        Returns:
            dict ``{cid: {'idx_list': [...], 'Smiles': ..., 'Smiles_clean': ...}}``;
            the SMILES keys are only present when a value is available.
        """
        dataTable = self._dataTableRaw
        dataDict_mol = {}

        ## Error/Warning checking
        if colName_id not in dataTable.columns:
            print(f"\tWarning! col <{colName_id}> is not in the table, using <molecule + row index> as ID.")
            colName_id = 'rowIndex_tmp'
            dataTable[colName_id] = 'molecule_' + dataTable.index.astype(str)
        self.__setAttributes("_colNameId", colName_id)
        ##
        assert colName_smi in dataTable.columns, f"\tError! col <{colName_smi}> is not in the table."
        self.__setAttributes("_colNameSmi", colName_smi)

        ## loop the data once (row label, id, smiles) and extract mol info
        for idx, cid, smi in zip(dataTable.index, dataTable[colName_id], dataTable[colName_smi]):
            if pd.isna(cid):
                ## warning of empty cid
                print(f"\tWarning! This row {idx} has empty value in <{colName_id}> column")
                continue

            ## add row info
            molInfo = dataDict_mol.setdefault(cid, {'idx_list': []})
            molInfo['idx_list'].append(idx)

            ## add structure info
            if pd.isna(smi):
                ## warning of empty smiles
                print(f"\tWarning! This mol {cid} has no value in <{colName_smi}> column")
                continue

            ## str() guards against a SMILES column that pandas parsed as a non-string type
            smi = self._cleanup_smi(str(smi))
            ## warning of mismatched smiles
            if 'Smiles' in molInfo and molInfo['Smiles'] != smi:
                print(f"\tWarning! This mol {cid} has multiple different values in <{colName_smi}> column")
            molInfo['Smiles'] = smi

            ## SMILES Canonicalization
            try:
                smi_clean = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
            except Exception as e:
                print(f"\tWarning! The SMILES of mol {cid} can not be Canonicalized by RDKit")
            else:
                molInfo['Smiles_clean'] = smi_clean

        print(f"\tTotal {len(dataDict_mol)} mols were extracted.")
        if len(dataDict_mol) != dataTable.shape[0]:
            print(f"\tWarning! The number of mols is not match the number of rows/entries in the table {dataTable.shape[0]}. Please double check")
        return dataDict_mol

    ## ==========================================================
    ## ============ tools for calculater descriptors ============
    ## ==========================================================
    ## shared loop: run one calculator over every molecule and store its results
    def __run_desc_calculator(self, calculator, descKey, bannerName, failName):
        """Fill ``molDict[cid][descKey]`` with ``calculator.calculation_from_smi(smi)``.

        Molecules without a SMILES, and calculations that raise or return nothing,
        are reported and leave an empty dict under ``descKey``.
        """
        molDict = self._molDict
        print(f'\t----------- Now start calculating {bannerName} ----------')
        for cid in molDict:
            molDict[cid][descKey] = {}
            ## prefer the canonical SMILES, fall back to the raw one
            smi = molDict[cid].get('Smiles_clean', molDict[cid].get('Smiles'))
            if smi is None:
                print(f"\tWarning, the mol <{cid}> fails to calculate {failName}. Error: no SMILES available")
                continue
            try:
                descDict = calculator.calculation_from_smi(smi)
            except Exception as e:
                print(f"\tWarning, the mol <{cid}> fails to calculate {failName}. Error: {e}")
                continue
            if descDict:
                molDict[cid][descKey].update(descDict)
            else:
                ## the calculators signal failure by returning None or an empty dict
                print(f"\tWarning, the mol <{cid}> fails to calculate {failName}. Error: calculator returned no values")
        print(f'\t----------- {bannerName} calculation done ----------')
        return molDict

    ## calc mol FPs
    def __calc_desc_fingerprints(self, fpType="ECFP", radius=3, nBits=2048):
        """Compute Morgan fingerprints for every molecule; results go under ``'desc_fps'``."""
        self.__setAttributes("_desc_fp_param", {"fpType": fpType, "radius": radius, "nBits": nBits})

        ## initiate a fps calculator (only ECFP is available)
        if fpType != "ECFP":
            print(f"\tWarning! Current version only support ECFP. Now generating the default ECFP{radius*2} ({nBits}bits)")
        fpsCalculator = morgan_Fps_calculator(radius=radius, nBits=nBits)

        return self.__run_desc_calculator(fpsCalculator, 'desc_fps', 'Molecular Fingerprints', 'molecular fingerprints')

    ## calc RDKit property
    def __calc_desc_rdkit(self, physChem=True, subStr=True, clean=False):
        """Compute RDKit descriptors for every molecule; results go under ``'desc_rdkit'``."""
        self.__setAttributes("_desc_rdkit_param", {"physChem": physChem, "subStr": subStr, "clean": clean})

        ## initiate a rdkit calculator
        rdCalculator = RDKit_desc_calculator(physChem=physChem, subStr=subStr, clean=clean)

        return self.__run_desc_calculator(rdCalculator, 'desc_rdkit', 'RDKit props', 'RDKit property')

    ## calc ChemAxon property
    def __calc_desc_chemaxon(self, ip='172.31.19.252', port='8064', calculator='calculate'):
        """Compute ChemAxon properties for every molecule; results go under ``'desc_cx'``."""
        self.__setAttributes("_desc_cx_param", {"ip": ip, "port": port, "calculator": calculator})

        ## initiate a ChemAxonAPI object with the endpoint that was requested
        cxAPI = ChemAxonAPI(ip=ip, port=port, calculator=calculator)

        return self.__run_desc_calculator(cxAPI, 'desc_cx', 'ChemAxon Property', 'ChemAxon property')

    ## ================================================
    ## ============ tools for splitting data ============
    ## ================================================
    @staticmethod
    def __nFoldSplit(dataTable, colName_based=None, CV=5):
        """Add a ``cv_id`` column that cuts the rows into ``CV`` equal-sized folds.

        Args:
            dataTable: table to split; it is not modified.
            colName_based: columns whose sort order defines the fold order;
                when empty/None the current row order is used.
            CV: number of folds.

        Returns:
            A new table ordered by the original index labels, re-indexed from 0,
            with the extra ``cv_id`` column.
        """
        assert dataTable.shape[0] >= CV, f"N_rows ({dataTable.shape[0]}) is less than N_folds ({CV})"
        ## order the rows, remembering the original index in a helper column
        ordered = dataTable.sort_values(by=list(colName_based)) if colName_based else dataTable
        ordered = ordered.rename_axis('index_tmp').reset_index()
        ## cut the ordered positions into CV quantile bins
        ordered['cv_id'] = pd.qcut(ordered.index, q=CV, labels=False)
        ## back to the original row order, helper column removed
        return ordered.sort_values(by=["index_tmp"], ascending=[True]).reset_index(drop=True).drop(columns=['index_tmp'])

    def __nFoldSplit_temporal(self, colName_date="Created On", CV=5):
        """Fold assignment that follows the order of ``colName_date`` (ties broken by id)."""
        assert colName_date in self._dataTableRaw.columns, f"Error! Column <{colName_date}> is not in self._dataTableRaw"
        ## .copy() so the helper column below never touches the raw table
        dataTable_split = self._dataTableRaw.loc[:, [self._colNameId, colName_date]].copy()
        dataTable_split["date_formatted"] = pd.to_datetime(dataTable_split[colName_date])
        dataTable_split = self.__nFoldSplit(dataTable_split, colName_based=["date_formatted", self._colNameId], CV=CV)
        dataDict_split = dataTable_split.to_dict(orient="index")
        return dataDict_split

    def __nFoldSplit_random(self, CV=5, rng=666666):
        """Fold assignment from a seeded shuffle of the rows."""
        dataTable_split = self._dataTableRaw.loc[:, [self._colNameId]].copy()
        ## shuffle, keeping the original index so the order can be restored afterwards
        dataTable_split = dataTable_split.sample(frac=1, random_state=rng).rename_axis('index_old').reset_index()
        dataTable_split = self.__nFoldSplit(dataTable_split, colName_based=None, CV=CV)
        dataTable_split = dataTable_split.sort_values(by=["index_old"], ascending=[True]).reset_index(drop=True).drop(columns=['index_old'])
        dataDict_split = dataTable_split.to_dict(orient="index")
        return dataDict_split


    ## ==========================================================
    ## ============ other tools ============
    ## ==========================================================
    ## clean up smiles
    def _cleanup_smi(self, smi):
        """Double every backslash in the SMILES and report the change."""
        if "\\" in smi:
            print(f'\tThere is a "\\" in the SMILES {smi}')
            smi = smi.replace('\\', '\\\\')
            print(f'\tAdd 1 "\\" into the SMILES, now new SMILES is {smi}')
        return smi

    ## set attributes with values outside of __init__
    def __setAttributes(self, attrName, attrValue):
        setattr(self, attrName, attrValue)



In [27]:
# Inspect the per-molecule dictionary held by testData (the full dictionary is displayed).
testData._molDict

{'KT-0032100': {'idx_list': [0],
  'Smiles': 'N1(CCSCC1)c3ccn2ncc(c2n3)C(=O)Nc4c(nn(c4)[C@H]9CC[C@H](CN5CCC(CC5)OCC#Cc6cccc7c6N(C)C(=O)N7C8CCC(=O)NC8=O)CC9)C(F)F',
  'Smiles_clean': 'Cn1c(=O)n(C2CCC(=O)NC2=O)c2cccc(C#CCOC3CCN(C[C@H]4CC[C@H](n5cc(NC(=O)c6cnn7ccc(N8CCSCC8)nc67)c(C(F)F)n5)CC4)CC3)c21',
  'desc_fps': {'Fps': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f3649c0e9d0>,
   'FP_bit_0': 0,
   'FP_bit_1': 1,
   'FP_bit_2': 0,
   'FP_bit_3': 0,
   'FP_bit_4': 0,
   'FP_bit_5': 0,
   'FP_bit_6': 0,
   'FP_bit_7': 0,
   'FP_bit_8': 0,
   'FP_bit_9': 0,
   'FP_bit_10': 1,
   'FP_bit_11': 0,
   'FP_bit_12': 0,
   'FP_bit_13': 0,
   'FP_bit_14': 0,
   'FP_bit_15': 0,
   'FP_bit_16': 0,
   'FP_bit_17': 0,
   'FP_bit_18': 0,
   'FP_bit_19': 0,
   'FP_bit_20': 0,
   'FP_bit_21': 0,
   'FP_bit_22': 0,
   'FP_bit_23': 0,
   'FP_bit_24': 0,
   'FP_bit_25': 0,
   'FP_bit_26': 0,
   'FP_bit_27': 0,
   'FP_bit_28': 0,
   'FP_bit_29': 1,
   'FP_bit_30': 0,
   'FP_bit_31': 0,
   'FP_bit_

In [7]:
# List the names of the instance attributes currently set on testData.
testData.__dict__.keys()

dict_keys(['_name', '_dataTableRaw', '_molDict', '_fileNameIn', '_colNameId', '_colNameSmi', '_desc_fp_param', '_desc_rdkit_param'])

In [13]:
# Scratch check: use the last character of each compound id as a provisional fold id.
# .copy() makes the id-column slice an independent frame before a column is added to it,
# which is the documented way to avoid pandas' SettingWithCopyWarning.
dataTable_tmp = testData._dataTableRaw.loc[:, [testData._colNameId]].copy()
dataTable_tmp['cv_id_tmp'] = dataTable_tmp[testData._colNameId].str[-1]
dataTable_tmp.head()

Unnamed: 0,Compound Name,cv_id_tmp
0,KT-0032100,0
1,KT-0032109,9
2,KT-0032267,7
3,KT-0034165,5
4,KT-0090865,5


In [25]:
# Show the scratch table as a dictionary keyed by row index.
dataTable_tmp.to_dict(orient="index")

{0: {'Compound Name': 'KT-0032100', 'cv_id_tmp': '0'},
 1: {'Compound Name': 'KT-0032109', 'cv_id_tmp': '9'},
 2: {'Compound Name': 'KT-0032267', 'cv_id_tmp': '7'},
 3: {'Compound Name': 'KT-0034165', 'cv_id_tmp': '5'},
 4: {'Compound Name': 'KT-0090865', 'cv_id_tmp': '5'},
 5: {'Compound Name': 'KT-0194980', 'cv_id_tmp': '0'},
 6: {'Compound Name': 'KT-0194981', 'cv_id_tmp': '1'},
 7: {'Compound Name': 'KT-0194986', 'cv_id_tmp': '6'},
 8: {'Compound Name': 'KT-0194988', 'cv_id_tmp': '8'},
 9: {'Compound Name': 'KT-0194990', 'cv_id_tmp': '0'},
 10: {'Compound Name': 'KT-0194991', 'cv_id_tmp': '1'},
 11: {'Compound Name': 'KT-0194992', 'cv_id_tmp': '2'},
 12: {'Compound Name': 'KT-0194993', 'cv_id_tmp': '3'},
 13: {'Compound Name': 'KT-0194995', 'cv_id_tmp': '5'},
 14: {'Compound Name': 'KT-0194996', 'cv_id_tmp': '6'},
 15: {'Compound Name': 'KT-0194997', 'cv_id_tmp': '7'},
 16: {'Compound Name': 'KT-0034843', 'cv_id_tmp': '3'},
 17: {'Compound Name': 'KT-0039737', 'cv_id_tmp': '7'},
 1

In [23]:
# Row indices whose provisional fold id is the string '1'.
dataTable_tmp[dataTable_tmp['cv_id_tmp']=='1'].index.to_list()

[6, 10, 21]

In [4]:
# Build the demo dataset: read ./test.csv, then compute fingerprints and RDKit
# descriptors (the ChemAxon calculation is switched off for this run).
fileName_input = "./test.csv"
testData = AssayData4ML(dataName='myTestData')
testData.load_csv(
    fileNameIn=fileName_input,
    sep=",",
    usecols=None,
    colName_id="Compound Name",
    colName_smi="Structure",
)
testData.calc_desc(desc_fps=True, desc_rdkit=True, desc_cx=False)

Using Encoding <ascii>
Loading data with 27 rows and 38 cols from file ./test.csv
	Total 27 mols were extracted.
	----------- Now start calculating Molecular Fingerprints ----------
	----------- Molecular Fingerprints calculation done ----------
	----------- Now start calculating RDKit props ----------
	----------- RDKit props calculation done ----------


In [None]:
# Preview one molecule record only: every record carries one entry per fingerprint bit,
# so displaying the whole dictionary floods the notebook output.
dict(list((testData._molDict or {}).items())[:1])

In [None]:
# Column name of the assay read-out; presumably the modelling target - TODO confirm against its later use.
colName_outcome = "ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num)"