In [None]:
# conda env: pyg (Python 3.9.16)
"""
Build a benchmarking pipeline for machine learning models and save the results to a CSV file (e.g. results.csv).
"""
import os
import sys
import warnings
from tqdm import tqdm
import joblib
import argparse

# inner modules
from datacat4ml.const import *
from datacat4ml.Scripts.model_dev.data_process import Data
from datacat4ml.Scripts.model_dev.ml import RFC, RFR, SVRR, SVCC, KNNR, KNNC, GBC, GBR
from datacat4ml.Scripts.model_dev.metrics import *
from datacat4ml.Scripts.model_dev.tune_alpha_low import get_config
from datacat4ml.Scripts.utils import mkdirs

#=========================Benchmarking==========================
# Model wrappers benchmarked when task == 'reg'.
algo4reg = [
    RFR,
    SVRR,
    KNNR,
    GBR,
]
# Model wrappers benchmarked when task == 'cls'.
algo4cls = [
    RFC,
    SVCC,
    KNNC,
    GBC,
]

In [None]:
def write_results(result_file, data, mcc=None, bedroc_dec5=None, bedroc_2=None, bedroc_8=None, rmse=None, cliff_rmse=None, r2=None, cliff_r2=None,
                  file_path: File_paths=DATASETS_DIR, task: str='cls', confidence_score: int=8, thr_class: int=6, use_clustering: bool=True, use_smote: bool=True, 
                  descriptor: str='ECFP4', algoname: Algos=RFC):
    """
    Append one row of benchmarking results to a CSV file.

    The file is located at ``RESULTS_ASSAYWISE_DIR/result_file``; the output
    directory is prepared with ``mkdirs`` and a header line is written first
    when the file does not exist yet.

    Parameters
    ----------
    result_file : str
        File name joined onto ``RESULTS_ASSAYWISE_DIR``.
    data :
        Object exposing ``target``, ``effect``, ``assay``, ``std_type``,
        ``y_train``, ``y_test``, ``cliff_mols_train`` and ``cliff_mols_test``.
        The cliff attributes are summed to obtain the cliff-compound counts.
    mcc, bedroc_dec5, bedroc_2, bedroc_8, rmse, cliff_rmse, r2, cliff_r2 :
        Metric values. They are formatted with ``str()``, so a metric that
        was not computed is written as the literal text ``None``.
    file_path :
        ``DATASETS_DIR`` is recorded as ``'categorized'`` and
        ``FETCH_DATA_DIR`` as ``'merged'``.
    task, confidence_score, thr_class, use_clustering, use_smote, descriptor :
        Run settings, written verbatim into the row.
    algoname :
        Algorithm label. When an object with a ``__name__`` (e.g. the default
        ``RFC``) is passed, that name is written instead of the object's repr.

    Raises
    ------
    ValueError
        If ``file_path`` is neither ``DATASETS_DIR`` nor ``FETCH_DATA_DIR``.
    """
    # Resolve the label before touching the disk so that an unsupported
    # file_path fails with a clear message instead of an UnboundLocalError
    # raised after a header-only file has already been created.
    if file_path == DATASETS_DIR:
        file_path_name = 'categorized'
    elif file_path == FETCH_DATA_DIR:
        file_path_name = 'merged'
    else:
        raise ValueError(
            f"Unsupported file_path {file_path!r}: expected DATASETS_DIR "
            f"({DATASETS_DIR!r}) or FETCH_DATA_DIR ({FETCH_DATA_DIR!r})")

    # Strings have no __name__, so labels passed as text are kept unchanged.
    algo_label = getattr(algoname, '__name__', algoname)

    n_train, n_test = len(data.y_train), len(data.y_test)
    n_cliff_train = sum(data.cliff_mols_train)
    n_cliff_test = sum(data.cliff_mols_test)

    header = ('file_path,task,confidence_score,thr_class,use_clustering,use_smote,'
              'target,effect,assay,std_type,descriptor,algo,'
              'n_compounds,n_cliff_compounds,n_compounds_train,n_cliff_compounds_train,n_compounds_test,n_cliff_compounds_test,'
              'mcc,bedroc_dec5,bedroc_2,bedroc_8,rmse,cliff_rmse,r2,cliff_r2\n')

    # No space before the newline: a trailing space would become part of the
    # last column's value (e.g. 'None ' instead of 'None').
    row = (f'{file_path_name},{task},{confidence_score},{thr_class},{use_clustering},{use_smote},'
           f'{data.target},{data.effect},{data.assay},{data.std_type},{descriptor},{algo_label},'
           f'{n_train + n_test},{n_cliff_train + n_cliff_test},'
           f'{n_train},{n_cliff_train},{n_test},{n_cliff_test},'
           f'{mcc},{bedroc_dec5},{bedroc_2},{bedroc_8},{rmse},{cliff_rmse},{r2},{cliff_r2}\n')

    output_dir = RESULTS_ASSAYWISE_DIR
    mkdirs(output_dir)
    result_path = os.path.join(output_dir, result_file)

    # Write the header only when the file is being created by this call.
    needs_header = not os.path.isfile(result_path)
    with open(result_path, 'a') as f:
        if needs_header:
            f.write(header)
        f.write(row)

In [None]:

def benchmark_result(result_file: str = "results.csv", file_path: File_paths=DATASETS_DIR, task: str = 'cls', 
                     confidence_score: int=8, thr_class: int=6, 
                     use_clustering: int=1, use_smote: int=1, descriptor: str='ECFP4'):
    """
    Train, save and evaluate every algorithm of a task on each data file.

    For every file found in
    ``file_path/assaywise_splited/<task>/confidence_score_*/thr_class_*/use_clustering_*``
    a ``Data`` object is built, featurized with ``descriptor``, optionally
    balanced, and shuffled. Each algorithm of the task is then trained with
    the hyperparameters read from its config file, dumped with joblib,
    evaluated on the test split, and one row is appended through
    ``write_results``.

    Failures are reported with ``warnings.warn`` and do not stop the run: a
    data-preparation failure skips the file, a training/evaluation failure
    skips only that algorithm.

    Parameters
    ----------
    result_file : str
        File name forwarded to ``write_results``.
    file_path :
        Root folder containing the ``assaywise_splited`` tree.
    task : str
        ``'cls'`` (uses ``algo4cls``) or ``'reg'`` (uses ``algo4reg``).
    confidence_score, thr_class : int
        Settings used to build the input, config and model paths.
    use_clustering, use_smote : int
        Flags converted with ``bool()``. ``use_smote`` is forced to ``False``
        for regression.
    descriptor : str
        Descriptor name passed to ``Data.featurize_data``.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'cls'`` nor ``'reg'``.
    """
    # Validate the task once up front; otherwise an unknown task would only
    # surface as a misleading per-file warning about an undefined name.
    if task == 'reg':
        algos = algo4reg
    elif task == 'cls':
        algos = algo4cls
    else:
        raise ValueError(f"Unsupported task {task!r}: expected 'cls' or 'reg'")

    use_clustering = bool(use_clustering)
    # SMOTE balancing only applies to classification; regression always runs without it.
    use_smote = bool(use_smote) and task == 'cls'

    print(f"file_path: {file_path}\n")
    print(f"task: {task}\n")
    print(f"confidence_score: {confidence_score}\n")
    print(f"thr_class: {thr_class}\n")
    print(f"use_clustering: {use_clustering}\n")

    # Sub-directories shared by the input, config and model trees.
    setting_dirs = (task,
                    f'confidence_score_{confidence_score}',
                    f'thr_class_{thr_class}',
                    f'use_clustering_{use_clustering}')

    file_folder = os.path.join(file_path, 'assaywise_splited', *setting_dirs)
    print(f"file_folder is {file_folder}")

    # Sorted so that files are processed (and rows written) in a stable order.
    filenames = sorted(os.listdir(file_folder))
    print(f"filenames is {filenames}")
    for filename in tqdm(filenames):
        print(f"file: {filename}\n")
        print(f"use_smote: {use_smote}\n")

        # Build and prepare the Data object; skip the file if any step fails.
        try:
            data = Data(file_folder, filename, task, use_smote)

            print(f"descriptor: {descriptor}\n")

            # Featurize SMILES strings with the given descriptor
            data.featurize_data(descriptor)

            if use_smote:
                data.balance_data()
            data.shuffle()
        except Exception as e:
            warnings.warn(f" -- FAILED to create Data object for {filename}: {e} --")
            continue

        # NOTE(review): drops a fixed 10-character suffix from the file name to
        # get the per-file folder name; the suffix itself is not visible here —
        # confirm it matches the naming of the split files.
        run_dirs = (*setting_dirs, f'use_smote_{use_smote}', filename[:-10])

        for algo in algos:
            algo_name = algo.__name__
            print(f"algo: {algo_name}\n")
            config_path = os.path.join(BEST_CONFIG_ASSAYWISE_DIR, *run_dirs, f"{algo_name}_{descriptor}.yml")
            model_path = os.path.join(MODELS_ASSAYWISE_DIR, *run_dirs, f"{algo_name}_{descriptor}.joblib")

            try:
                # exist_ok avoids the check-then-create race of isdir + makedirs.
                os.makedirs(os.path.dirname(model_path), exist_ok=True)

                # Get the best hyperparameters stored in the config file
                print(f"read best config ...")
                best_config = get_config(config_path)
                print('Done')

                # Train the model with the best hyperparameters
                print(f"train model ...")
                model = algo(task, **best_config)

                if data.x_smote_train is not None:
                    print(f"smote is used")
                    model.train(data.x_smote_train, data.y_smote_train)
                else:
                    print(f"smote is not used")
                    model.train(data.x_train, data.y_train)
                print('Done')

                # Save the model
                print(f"save model ...")
                with open(model_path, 'wb') as handle:
                    joblib.dump(model, handle)
                print('Done')

                # Evaluate the model
                print(f"evaluate model ...")
                y_pred = model.predict(data.x_test)
                if task == 'cls':
                    mcc = calc_mcc(data.y_test, y_pred)
                    y_pred_proba = model.predict_proba(data.x_test)
                    # NOTE(review): the three alpha values presumably correspond to the
                    # early-recognition fractions in the metric names (0.5 %, 2 %, 8 %) — confirm.
                    bedroc_dec5 = calc_bedroc(y_pred_proba=y_pred_proba, y_true=data.y_test, alpha=321.9)
                    bedroc_2 = calc_bedroc(y_pred_proba=y_pred_proba, y_true=data.y_test, alpha=80.5)
                    bedroc_8 = calc_bedroc(y_pred_proba=y_pred_proba, y_true=data.y_test, alpha=20.0)

                    # Regression metrics do not apply to classification.
                    r2, cliff_r2, rmse, cliff_rmse = None, None, None, None
                else:  # task == 'reg' (validated above)
                    r2 = calc_r2(data.y_test, y_pred)
                    cliff_r2 = calc_cliff_r2(y_test_pred=y_pred, y_test=data.y_test,
                                             cliff_mols_test=data.cliff_mols_test)
                    rmse = calc_rmse(data.y_test, y_pred)
                    cliff_rmse = calc_cliff_rmse(y_test_pred=y_pred, y_test=data.y_test,
                                                 cliff_mols_test=data.cliff_mols_test)
                    # Classification metrics do not apply to regression.
                    mcc, bedroc_dec5, bedroc_2, bedroc_8 = None, None, None, None
                print('Done')

                # Write the results to a csv file
                print(f"write results ...")
                write_results(result_file=result_file, data=data,
                              mcc=mcc, bedroc_dec5=bedroc_dec5, bedroc_2=bedroc_2, bedroc_8=bedroc_8,
                              rmse=rmse, cliff_rmse=cliff_rmse, r2=r2, cliff_r2=cliff_r2,
                              file_path=file_path, task=task, confidence_score=confidence_score,
                              thr_class=thr_class, use_clustering=use_clustering, use_smote=use_smote,
                              descriptor=descriptor, algoname=algo_name)

                print("Done")

                print('######################')

            except Exception as e:
                # `except Exception` (not a bare except) lets Ctrl-C / SystemExit stop
                # the run, and the cause is included so failures can be diagnosed.
                warnings.warn(f" -- FAILED {filename}, {task}, use_smote_{use_smote}, {algo_name}-{descriptor}: {e}")



In [None]:
# ============================== main ==============================
def build_arg_parser():
    """
    Build the command-line parser for the benchmarking run.

    All options are required. ``--task`` is restricted to ``cls`` or ``reg``
    so that an invalid value is rejected by the parser itself.
    """
    parser = argparse.ArgumentParser(description='Model building and benchmarking')
    parser.add_argument('--result_file', type=str, required=True, help='path to the result file')
    parser.add_argument('--file_path', type=str, required=True, help='path to the data folder')
    parser.add_argument('--task', type=str, required=True, choices=('cls', 'reg'), help='task: cls or reg')
    parser.add_argument('--confidence_score', type=int, required=True, help='confidence score')
    parser.add_argument('--thr_class', type=int, required=True, help='threshold for class')
    parser.add_argument('--use_clustering', type=int, required=True, help='use clustering or not')
    parser.add_argument('--use_smote', type=int, required=True, help='use smote or not')
    parser.add_argument('--descriptor', type=str, required=True, help='descriptor')
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()

    benchmark_result(result_file=args.result_file,
                     file_path=args.file_path,
                     task=args.task,
                     confidence_score=args.confidence_score,
                     thr_class=args.thr_class,
                     use_clustering=args.use_clustering,
                     use_smote=args.use_smote,
                     descriptor=args.descriptor)