In [None]:
####################################################################
####################################################################
####################################################################
class desc_calculator_rdkit(object):
    """Calculate RDKit molecular descriptors for a single SMILES string.

    The descriptor names are taken from ``rdkit.Chem.Descriptors._descList``
    and optionally filtered:

    * ``physChem``: keep the descriptors whose name does not start with ``fr_``.
    * ``subStr``: keep the descriptors whose name starts with ``fr_``.
    * ``clean``: additionally drop the descriptors whose name starts with one
      of the prefixes listed in ``__define_desc_list``.

    After every call to ``calculate`` the attribute ``dataDict_results`` maps
    descriptor name -> value; it is an empty dict when the calculation failed.
    """

    def __init__(self, physChem=True, subStr=True, clean=False):
        self._desc_physChem = physChem
        self._desc_subStr = subStr
        self._desc_clean = clean
        self._desc_list = self.__define_desc_list()
        # make the attribute available even before the first calculate() call
        self.dataDict_results = {}

        from rdkit.ML.Descriptors import MoleculeDescriptors
        self._calculator = MoleculeDescriptors.MolecularDescriptorCalculator(self._desc_list)
        # print(f"\tInitiate a RDKit desc calcualtor for {len(self._desc_list)} desc.")

    def __define_desc_list(self):
        """Return the list of descriptor names selected by the constructor flags.

        Raises:
            AssertionError: if both ``physChem`` and ``subStr`` are False.
        """
        from rdkit.Chem import Descriptors

        ## error checking
        assert self._desc_physChem or self._desc_subStr, "\tError! One of <physChem> or <subStr> should be True."

        # all descriptors known to this RDKit build
        all_list = [n[0] for n in Descriptors._descList]

        ## define descriptor list
        if self._desc_physChem and self._desc_subStr:
            # using all descriptors
            desc_list = all_list
        elif self._desc_physChem:
            # only the physicochemical properties (everything that is not 'fr_*')
            desc_list = [i for i in all_list if not i.startswith('fr_')]
        else:
            # only the substructure features <Fraction of a substructure (e.g., 'fr_Al_COO')>
            desc_list = [i for i in all_list if i.startswith('fr_')]

        if self._desc_clean:
            # str.startswith accepts a tuple, so one pass removes all prefixes
            rm_prefixes = ('BCUT2D_', 'Chi', 'EState_', 'VSA_', 'SlogP_', 'SMR_', 'PEOE_')
            desc_list = [i for i in desc_list if not i.startswith(rm_prefixes)]
        return desc_list

    def calculate(self, smi):
        """Calculate the descriptors of one molecule.

        Args:
            smi: SMILES string of the molecule.

        Returns:
            The sequence of descriptor values returned by the RDKit calculator
            (same order as the descriptor list), or ``None`` when the SMILES
            could not be parsed or the calculation failed. In the failure case
            ``dataDict_results`` is left empty and a message is printed.
        """
        self.dataDict_results = {}

        try:
            from rdkit import Chem
            mol = Chem.MolFromSmiles(smi)
        except Exception as e:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: {e}')
            return None

        # MolFromSmiles reports an unparsable SMILES by returning None, not by raising
        if mol is None:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: RDKit returned None')
            return None

        try:
            result = self._calculator.CalcDescriptors(mol)
        except Exception as e:
            print(f'\tThis mol cannot be calculated property using RDKit; Error msg: {e}')
            return None

        assert len(self._desc_list) == len(result), "\tError! Num_calc_desc does not match desc_list"
        self.dataDict_results = dict(zip(self._desc_list, result))
        return result


####################################################################
####################################################################
####################################################################
class desc_calculator_morganFPs(object):
    """Calculate a Morgan fingerprint bit vector for a single SMILES string.

    Args:
        radius: fingerprint radius, converted with ``int``.
        nBits: length of the bit vector, converted with ``int``.

    After every call to ``calculate`` the attribute ``dataDict_results`` maps
    ``'FP_bit_<i>'`` -> bit value; it is an empty dict when the calculation
    failed.
    """

    def __init__(self, radius=3, nBits=1024):
        self._radius = int(radius)
        self._nBits = int(nBits)
        # make the attribute available even before the first calculate() call
        self.dataDict_results = {}

    def calculate(self, smi):
        """Calculate the fingerprint of one molecule.

        Args:
            smi: SMILES string of the molecule.

        Returns:
            The bit vector returned by RDKit, or ``None`` when the SMILES could
            not be parsed or the fingerprint calculation failed. In the failure
            case ``dataDict_results`` is left empty and a message is printed.
        """
        self.dataDict_results = {}

        try:
            from rdkit import Chem
            mol = Chem.MolFromSmiles(smi)
        except Exception as e:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: {e}')
            return None

        # MolFromSmiles reports an unparsable SMILES by returning None, not by raising
        if mol is None:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: RDKit returned None')
            return None

        try:
            from rdkit.Chem import AllChem
            result = AllChem.GetMorganFingerprintAsBitVect(mol, radius=self._radius, nBits=self._nBits)
        except Exception as e:
            print(f'\tThis mol cannot be calculated into FPs using RDKit; Error msg: {e}')
            return None

        # one entry per bit position
        for i in range(len(result)):
            self.dataDict_results[f'FP_bit_{i}'] = result[i]
        return result

   
####################################################################
####################################################################
####################################################################
class desc_calculator_chemaxon(object):
    """Calculate molecular properties through a ChemAxon calculator REST endpoint.

    The request is sent with ``curl`` (via ``subprocess``) and the JSON answer
    is flattened into a ``{column_name: value}`` dict.

    Args:
        version: ``'V23'`` or ``'V22'``; selects the server address and the
            pKa parameter set. Any other value is handled like ``'V22'``.
        desc_list: which calculations to request. ``None`` or ``'all'`` selects
            every calculation defined in ``_load_api_param_dict``; ``'basic'``,
            ``'protonation'``, ``'topology'`` and ``'prediction'`` select a
            predefined group; otherwise an iterable of calculation names.

    After every call to ``calculate`` the attribute ``dataDict_results`` holds
    the flattened results; it is an empty dict when the calculation failed.
    """

    def __init__(self, version='V22', desc_list=None):
        # must be set first: _load_api_param_dict reads it to pick the pKa parameters
        self._version = version
        self._define_cxAPI(version)
        self._desc_list = self._define_desc_list(desc_list)
        self.dataDict_results = {}

    ## ==================== define the calculator ====================
    def _define_cxAPI(self, version):
        """Build the base ``curl`` command (stored in ``self._api``) for the selected server."""
        if version == 'V23':    ## v23.16
            ip = '172.31.19.252'
        else:    ## 'V22' and any other value: v22.50
            ip = '172.31.25.202'

        URL_api = f'http://{ip}:8064/rest-v1/calculator/calculate'
        header1 = 'accept: */*'    # header1 = 'accept: application/json'
        header2 = 'Content-Type: application/json'
        self._api = ['curl', '-X', 'POST', URL_api, '-H', header1, '-H', header2]
        return None

    def _define_desc_list(self, desc_list):
        """Translate the requested calculations into JSON fragments.

        Args:
            desc_list: ``None``/``'all'``, one of the predefined group names,
                or an iterable of calculation names.

        Returns:
            A list of strings of the form ``'"<name>": <json params>'``, one
            per distinct requested calculation that has parameters defined.
            Unknown names are reported with a printed warning and skipped.
        """
        api_param_dict = self._load_api_param_dict()

        # the original 'basic' group listed "polar-surface-area" twice;
        # a repeated name would produce a duplicate key in the request JSON
        named_groups = {
            'basic': ["elemental-analysis", "polar-surface-area"],
            'protonation': ["logp", "logd", "charge", "pka"],
            'topology': ["topology-analyser"],
            'prediction': ["hlb", "bbb", "cns-mpo", "solubility", "herg-activity", "herg-class"],
        }

        if desc_list is None:
            names = list(api_param_dict)
        elif isinstance(desc_list, str):
            if desc_list == 'all':
                names = list(api_param_dict)
            else:
                # a single calculation name given as a plain string is treated
                # as a one-element list instead of being iterated per character
                names = named_groups.get(desc_list, [desc_list])
        else:
            names = list(desc_list)

        formatted = []
        seen = set()
        for desc in names:
            if desc in seen:
                continue
            seen.add(desc)
            if desc in api_param_dict:
                formatted.append(f'"{desc}": {api_param_dict[desc]}')
            else:
                print(f"\t\tWarning, this prop <{desc}> is not in the <calculations dict>")
        return formatted

    ## ==================== run the API call ====================
    def _run_cxAPI(self, mol):
        """Send one molecule to the calculator endpoint.

        Args:
            mol: RDKit molecule; it is converted to canonical SMILES for the request.

        Returns:
            Tuple ``(stdout, stderr)`` of the ``curl`` process, both as bytes.
        """
        import json
        import subprocess

        ## ---------------- prepare dataJson & cmd ----------------
        ## prepare smiles
        from rdkit import Chem
        smi = Chem.MolToSmiles(mol, canonical=True)

        ## prepare the property and calculation paramters
        api_param = ', '.join(self._desc_list)

        ## prepare dataJson
        # json.dumps quotes and escapes the SMILES (a stereo bond is written
        # with a backslash, which is not valid inside a raw JSON string)
        dataJson = '{"calculations": {%s}, "inputFormat": "smiles", "structure": %s}' % (api_param, json.dumps(smi))

        ## ---------------- run command ----------------
        self._cmd = self._api + ['-d', dataJson]
        process = subprocess.Popen(self._cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        return (output, error)

    ## ==================== decode the calculation output ====================
    def _parse_result(self, result, detailedInfo=False):
        """Flatten the JSON answer of the calculator into one dict.

        Args:
            result: tuple ``(output, error)`` as returned by ``_run_cxAPI``;
                only ``output`` (JSON as bytes or str) is used.
            detailedInfo: when False, entries whose name contains ``'ByAtom'``
                are skipped.

        Returns:
            Dict mapping column name -> value. Empty when the output cannot be
            decoded (a message is printed in that case).
        """
        import json

        output = result[0]
        dataDict_out = {}
        try:
            # the service answers in JSON; json.loads understands true/false/null
            output_decoded = json.loads(output)
        except Exception as e:
            print(f'\tCannot decode the output. Error msg: {e}')
            return dataDict_out

        ## loop all the results in the output and extract them one by one
        for propType in output_decoded:
            ## eliminate un-wanted properties
            if propType in ("isoelectric-point", "pka-distribution", "major-microspecies"):
                continue

            propData = output_decoded[propType]
            if not isinstance(propData, dict):
                # not a per-calculation result object; keep the raw value
                dataDict_out[propType] = propData

            ## detect if there is errors for the calculation
            elif "error" in propData:
                dataDict_out[propType] = propData["error"]["message"]

            elif propType == 'logd':
                for item in propData['logDByPh']:
                    pH = item['pH']
                    dataDict_out[f'{propType}[pH={pH}]'] = item['value']

            elif propType == 'solubility':
                for item in propData['phDependentSolubilities']:
                    # take the pH of this item (not a value left over from the logd loop)
                    pH = item['pH']
                    dataDict_out[f'{propType}[pH={pH}]'] = item['value']

            elif propType == 'pka':
                ## keep the two lowest acidic and the two lowest basic pKa values
                for pKa_type in ['acidic', 'basic']:
                    pKa_values = propData.get(f'{pKa_type}ValuesByAtom') or []
                    pKa_spec = sorted(pKa_values, key=lambda x: x['value'])
                    for i_pka, pKa_item in enumerate(pKa_spec[:2], start=1):
                        dataDict_out[f'pka_{pKa_type[0]}pKa{i_pka}'] = pKa_item['value']

            else:
                for propName in propData:
                    ## for the atom level detailed information, by pass if <detailedInfo> flag false
                    if not detailedInfo and 'ByAtom' in propName:
                        continue
                    ## cns/bbb score with components
                    if propType in ['cns-mpo', 'bbb'] and propName == 'properties':
                        for item in propData[propName]:
                            component_name = item['name']
                            component_score = item['score']
                            component_value = item['value']
                            dataDict_out[f'{propType}_component_{component_name}'] = f'{component_score} ({component_value})'
                    else:
                        dataDict_out[f'{propType}_{propName}'] = propData[propName]
        return dataDict_out

    ## run calculation
    def calculate(self, smi):
        """Calculate the ChemAxon properties of one molecule.

        Args:
            smi: SMILES string of the molecule.

        Returns:
            The raw ``(stdout, stderr)`` tuple of the request, or ``None`` when
            the SMILES could not be parsed or the request/parsing failed. In
            the failure case ``dataDict_results`` is left empty and a message
            is printed.
        """
        self.dataDict_results = {}

        try:
            from rdkit import Chem
            mol = Chem.MolFromSmiles(smi)
        except Exception as e:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: {e}')
            return None

        # MolFromSmiles reports an unparsable SMILES by returning None, not by raising
        if mol is None:
            print(f'\tThis SMILES cannot be transfer into mol using RDKit: {smi}; Error msg: RDKit returned None')
            return None

        try:
            result = self._run_cxAPI(mol)
            dataDict_out = self._parse_result(result, detailedInfo=False)
        except Exception as e:
            print(f'\tThis mol cannot be calculated property using ChemAxon; Error msg: {e}')
            return None

        # No length check against self._desc_list here: one calculation is
        # flattened into several output keys and some calculations are dropped
        # by _parse_result, so the two lengths are not expected to match.
        self.dataDict_results.update(dataDict_out)
        return result

    ## define the API calculation parameters
    def _load_api_param_dict(self):
        """Return a dict mapping calculation name -> JSON parameter string."""
        api_param_dict = {}
        version = getattr(self, '_version', None)

        ## --------------- basic ---------------
        api_param_dict["elemental-analysis"] = '{"countAtoms": [1, 6, 8], "countIsotopes": [{"atomNumber": 6, "isotopeNumber": 12}], "operations": "mass, formula", "symbolID": true}'
        api_param_dict["partial-elemental-analysis"] = '{"indexes":[0]}'
        api_param_dict["polar-surface-area"] = '{"excludePhosphorus": true, "excludeSulfur": true, "pH": null}'
        api_param_dict["hbda"] = '{"excludeHalogens": true, "excludeSulfur": true, "outputFormat": "mrv", "outputStructureIncluded": false, "pH": 7.4}'

        ## --------------- protonation ---------------
        api_param_dict["logp"] = '{"atomIncrements": true, "method": "CHEMAXON"}'
        api_param_dict["logd"] = '{"phList": [7.4]}'    # 'logd': '{"phList": [1.5, 5, 6.5, 7.4]}'
        api_param_dict["charge"] = '{"ph": 7.4}'
        # no trailing comma here: it would turn the value into a 1-tuple
        api_param_dict["pka-distribution"] = '{"considerTautomerization": true, "pKaLowerLimit": -20, "pKaUpperLimit": 10, "phSequence": {"pHLower": 1.5, "pHStep": 0.1, "pHUpper": 7.4}, "resultMoleculeFormat": "MRV", "temperature": 298}'

        if version == 'V23':
            api_param_dict["pka"] = '{"micro": false, "outputFormat": "mrv", "outputStructureIncluded": false, "pKaLowerLimit": -20, "pKaUpperLimit": 10, "prefix": "DYNAMIC", "temperature": 298, "types": "pKa, acidic, basic"}'
        else:
            # 'V22' and any other value share the same parameters
            api_param_dict["pka"] = '{"micro": false, "pKaLowerLimit": -10, "pKaUpperLimit": 20, "prefix": "STATIC", "temperature": 298, "types": "pKa, acidic, basic"}'
            # api_param_dict["pka"] = '{"micro": false, "pKaLowerLimit": -20, "pKaUpperLimit": 10, "prefix": "DYNAMIC", "temperature": 298, "types": "pKa, acidic, basic"}'

        ## --------------- topology (ring system) ---------------
        # myOperationTopology = "aromaticRingCount, aromaticRings"
        myOperationTopology = 'fsp3, chainBondCount, rotatableBondCount, aromaticAtomCount, chiralCenterCount, aromaticRingCount, heteroRingCount, fusedAliphaticRingCount, aliphaticRingCount, fusedAromaticRingCount, heteroAromaticRingCount, fusedRingCount, largestRingSystemSize, largestRingSize, ringSystemCount'
        api_param_dict["topology-analyser"] = '{"aliphaticRingSize": 0, "aromaticRingSize": 0, "aromatizationMethod": "GENERAL", "carboRingSize": 0, "fusedAliphaticRingSize": 0, "fusedAromaticRingSize": 0, "heteroAliphaticRingSize": 0, "heteroAromaticRingSize": 0, "heteroRingSize": 0, "ringSize": 0, "ringSystemSize": 0, "operations": "myOperationText", "outputFormat": "mrv"}'
        api_param_dict["topology-analyser"] = api_param_dict['topology-analyser'].replace('myOperationText', myOperationTopology)

        ## --------------- prediction ---------------
        api_param_dict["hlb"] = '{}'
        api_param_dict["bbb"] = '{}'
        api_param_dict["cns-mpo"] = '{}'
        api_param_dict["solubility"] = '{"phSequence": {"pHLower": 1.5, "pHStep": 0.1, "pHUpper": 7.4}, "unit": "MM"}'
        api_param_dict["herg-activity"] = '{"outputFormat": "mrv"}'
        api_param_dict["herg-class"] = '{"outputFormat": "mrv"}'

        ## --------------- 3D Conformation ---------------
        # api_param_dict["conformer"] = '{"conformerCount": 5, "diversity": 0.1, "outputFormat": "mrv", "timeLimit": 900}'

        ## --------------- others ---------------
        api_param_dict["isoelectric-point"] = '{"pHStep": 0.5}'
        api_param_dict["major-microspecies"] = '{"pH": 7.4, "resultMoleculeFormat": "MRV"}'

        ## --------------- unlicensed ---------------
        # api_param_dict["stereoisomer"] = '{"maxStereoisomerCount": 1000, "outputIn3d": false, "protectDoubleBondStereo": false, "protectTetrahedralStereo": false, "resultMoleculeFormat": "MRV", "type": "TETRAHEDRAL", "verify3d": false}'
        # api_param_dict["tautomerization-canonical"] = '{"normalTautomerGenerationMode": true, "resultMoleculeFormat": "MRV"}'
        # api_param_dict["tautomerization-dominant"] = '{"resultMoleculeFormat": "MRV"}'

        return api_param_dict


In [None]:
def Args_Prepation(parser_desc):
    """Parse the command-line options of the descriptor script.

    Args:
        parser_desc: description text shown in the ``--help`` output.

    Returns:
        The ``argparse.Namespace`` produced from the process arguments.
    """
    import argparse

    # (flags, keyword options) in the order they are registered
    option_table = [
        (('-i', '--input'), dict(action="store", default=None, help='The input csv file')),
        (('-d', '--delimiter'), dict(action="store", default=',', help='The delimiter of input csv file for separate columns')),
        (('--detectEncoding',), dict(action="store_true", help='detect the encoding type of the csv file')),
        (('--colId',), dict(action="store", default='Compound Name', help='The column name of the compound identifier')),
        (('--colSmi',), dict(action="store", default='Structure', help='The column name of the compound smiles')),
        (('--desc_fps',), dict(action="store_true", help='calculate the molecular fingerprints')),
        (('--desc_rdkit',), dict(action="store_true", help='calculate the molecular property using RDKit')),
        (('--desc_cx',), dict(action="store_true", help='calculate the molecular property using ChemAxon')),
    ]

    cli = argparse.ArgumentParser(description=parser_desc)
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()

In [None]:
def calc_desc_for_table(dataTable, colName_mid, colName_smi, desc_calculator):
    """Run a descriptor calculator over every row of a table.

    Args:
        dataTable: table indexable as ``dataTable[column][row_label]`` and
            exposing the row labels through ``dataTable.index``.
        colName_mid: name of the molecule-identifier column.
        colName_smi: name of the SMILES column.
        desc_calculator: object with a ``calculate(smi)`` method that stores
            its output in the ``dataDict_results`` attribute.

    Returns:
        Dict mapping molecule id -> dict that holds the id (under
        ``colName_mid``) plus every non-empty calculator result for that id.
    """
    collected = {}

    for row_label in dataTable.index:
        mol_id = dataTable[colName_mid][row_label]
        smiles = dataTable[colName_smi][row_label]

        ## the first time an id shows up its entry starts with the id itself
        entry = collected.setdefault(mol_id, {colName_mid: mol_id})

        ## run the calculation and merge whatever came back
        desc_calculator.calculate(smiles)
        latest = desc_calculator.dataDict_results
        if len(latest) > 0:
            entry.update(latest)

    return collected

In [None]:
def main():
    """Calculate the selected descriptors for every molecule of the input table.

    The input csv file is read with pandas, each enabled calculator (RDKit
    descriptors, Morgan fingerprints, ChemAxon properties) is run over all
    rows, and the results are merged per molecule id.

    Returns:
        A pandas DataFrame with one row per molecule id, holding the id column
        and every calculated descriptor.

    Raises:
        AssertionError: if the id column or the SMILES column is missing.
    """
    ## command-line version of the settings below (currently disabled):
    # args = Args_Prepation(parser_desc='Preparing the input files and the descriptors')
    # fileNameIn = args.input    # '../../1_DataPrep/results/data_input_clean.csv'
    # sep = args.delimiter
    # detect_encoding = True if args.detectEncoding else False
    # colName_mid = args.colId    # 'Compound Name'
    # colName_smi = args.colSmi    # 'Structure'
    # desc_fps = True if args.desc_fps else False
    # desc_rdkit = True if args.desc_rdkit else False
    # desc_cx = True if args.desc_cx else False
    fileNameIn = '../../1_DataPrep/results/data_input_clean.csv'
    sep = ','
    colName_mid = 'Compound Name'
    colName_smi = 'Structure'
    desc_fps, desc_rdkit, desc_cx = True, True, True
    print(f"\tCalculating descriptors (FPs {desc_fps}; RDKit {desc_rdkit}; ChemAxon: {desc_cx}) ... ")

    ## ------------ load data ------------
    import pandas as pd
    dataTable_raw = pd.read_csv(fileNameIn, sep=sep)
    assert colName_mid in dataTable_raw.columns, f"\tColumn name for mol ID <{colName_mid}> is not in the table."
    assert colName_smi in dataTable_raw.columns, f"\tColumn name for mol smiles <{colName_smi}> is not in the table."

    # merged results: molecule id -> {column name: value}
    dataDict_all = {}

    def _merge(dataDict_new):
        for mid, row in dataDict_new.items():
            dataDict_all.setdefault(mid, {}).update(row)

    ## ------------ calculate rdkit properties ------------
    if desc_rdkit:
        calculator_rd = desc_calculator_rdkit(physChem=True, subStr=True, clean=False)
        _merge(calc_desc_for_table(dataTable_raw, colName_mid, colName_smi, calculator_rd))

    ## ------------ calculate mol fingerprints ------------
    if desc_fps:
        calculator_fp = desc_calculator_morganFPs(radius=3, nBits=1024)
        _merge(calc_desc_for_table(dataTable_raw, colName_mid, colName_smi, calculator_fp))

    ## ------------ calculate chemAxon properties ------------
    if desc_cx:
        calculator_cx = desc_calculator_chemaxon(version='V22', desc_list='all')
        dataDict_cx = calc_desc_for_table(dataTable_raw, colName_mid, colName_smi, calculator_cx)

        # NOTE(review): the previous code handed this list to a helper as
        # <rmProps>; it is assumed to mean "properties to remove from the
        # ChemAxon results" - confirm before relying on it.
        rmProps = ['polar-surface-area_unit', 'pka_apKa1', 'pka_apKa2', 'pka_bpKa1', 'pka_bpKa2']
        for row in dataDict_cx.values():
            for prop in rmProps:
                row.pop(prop, None)
        _merge(dataDict_cx)

    ## ------------ build the descriptor table ------------
    dataTable_desc = pd.DataFrame.from_dict(dataDict_all, orient='index').reset_index(drop=True)
    return dataTable_desc

# if __name__ == '__main__':
#     main()