In [2]:
# conda env: twinbooster (Python 3.8.20). `pip install twinbooster` requires Python 3.8
import os
import sys

from datacat4ml.const import *
from datacat4ml.utils import mkdirs
from datacat4ml.Scripts.model_dev.metrics import *

import pandas as pd
import twinbooster

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
## twinbooster.download_models() # download the pre-trained models. This step is necessary only once.
#twinbooster.download_models()
tb = twinbooster.TwinBooster()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# load the twinbooster model

for use_clustering in Use_clusterings:

    print (f'---> Use clustering: {use_clustering}')

    folder_path = os.path.join(SPLIT_CAT_DATASETS_DIR, 'cls', 'use_clustering'+'_'+str(use_clustering))
    files = os.listdir(folder_path)
    split_files = [file for file in files if file.endswith('splited.csv')]

    for split_file in split_files:
        
        print(f'Take the split file: {split_file}\n')
        split_file_path = os.path.join(folder_path, split_file)
        df = pd.read_csv(split_file_path)

        # get the smiles and assay descriptions in the test set
        test_smis = df[df['split'] == 'test']['canonical_smiles_by_Std'].tolist()
        print(f'The length of test_smis is {len(test_smis)}')

        test_assay_descs = df[df['split'] == 'test']['assay_desc'].tolist()
        print(f'\nThe length of test_assay_descs is {len(test_assay_descs)}')

        # zip the smiles and assay descriptions
        test_data = list(zip(test_smis, test_assay_descs))
        probs = []
        confs = []
        preds = []
        for i in test_data:
            smi = i[0]
            assay_desc = i[1]
            prob, conf = tb.predict([smi], [assay_desc], get_confidence=True)
            probs.append(prob)
            confs.append(conf)
            
            if prob > 0.5:
                preds.append(1.0)
            else:
                preds.append(0.0)
    
        print(f'probs: {probs}')
        print(f'confs: {confs}')
        print(f'preds: {preds}')

        
        # get the true labels
        trues = df[df['split'] == 'test']['activity'].tolist()
        # set the data type of the true labels to int
        trues = [int(i) for i in trues]
        print(f'true_list: {trues}')   

        # calculate the metrics
        accuracy = calc_accuracy(trues, preds)
        precision = calc_precision(trues, preds)
        recall = calc_recall(trues, preds)
        # roc_auc = calc_roc_auc(true_list, probs_list)
        # f1 = calc_f1(true_list, pred_list)
        mcc = calc_mcc(trues, preds)
        
        #bedroc_dec5 = bedroc_score(true_list, pred_list, alpha=321.9)
        #bedroc_2 = bedroc_score(true_list, pred_list, alpha=80.5)
        #bedroc_8 = bedroc_score(true_list, pred_list, alpha=20.0)

        # columns to be written to the results file
        file_path_name = 'CAT_ORs'
        task = 'cls'
        use_smote = 'None'
        target = df['target_chembl_id'].iloc[0]
        effect = df['effect'].iloc[0]
        assay = df['assay'].iloc[0]
        std_type = df['std_type'].iloc[0]
        descriptor = 'FP'
        algoname = 'TB'
        n_compounds = len(df)
        n_cliff_compounds = 'NA'
        n_compounds_train = len(df[df['split'] == 'train'])
        n_cliff_compounds_train = 'NA'
        n_compounds_test = len(df[df['split'] == 'test'])
        n_cliff_compounds_test = 'NA'
        threshold = df['threshold'].iloc[0]
        rmse = 'None'
        cliff_rmse = 'None'
        r2 = 'None'
        cliff_r2 = 'None'

        # temp
        bedroc_dec5 = 'None'
        bedroc_2 = 'None'
        bedroc_8 = 'None'
        roc_auc = 'None'
        f1 = 'None'

        if not os.path.isfile(os.path.join(BMK_CAT_DIR , 'results_tb.csv')):
            with open(os.path.join(BMK_CAT_DIR , 'results_tb.csv'), 'w') as f:
                f.write('file_path,task,use_clustering,use_smote,'
                        'target,effect,assay,std_type,descriptor,algo,'
                        'n_compounds,n_cliff_compounds,n_compounds_train,n_cliff_compounds_train,n_compounds_test,n_cliff_compounds_test,'
                        'threshold,'
                        'accuracy, precision, recall, mcc,bedroc_dec5,bedroc_2,bedroc_8,'
                        'rmse,cliff_rmse,r2,cliff_r2\n')
        
        with open(os.path.join(BMK_CAT_DIR , 'results_tb.csv'), 'a') as f:
            f.write(f'{file_path_name},{task},{use_clustering},{use_smote},'
                    f'{target},{effect},{assay},{std_type},{descriptor},{algoname},'
                    f'{n_compounds},{n_cliff_compounds},{n_compounds_train},{n_cliff_compounds_train},{n_compounds_test},{n_cliff_compounds_test},'
                    f'{threshold},' 
                    f'{accuracy},{precision},{recall},{mcc},{bedroc_dec5},{bedroc_2},{bedroc_8},'
                    f'{rmse},{cliff_rmse},{r2},{cliff_r2} \n')

---> Use clustering: True
Take the split file: dor_agon_G_cAMP_EC50_splited.csv

The length of test_smis is 27

The length of test_assay_descs is 27
probs: [array([0.46895376]), array([0.44638165]), array([0.45221861]), array([0.45083999]), array([0.21372387]), array([0.44671968]), array([0.45793426]), array([0.36516226]), array([0.43787947]), array([0.3611188]), array([0.18047745]), array([0.38349454]), array([0.47225773]), array([0.39455786]), array([0.32381417]), array([0.22686345]), array([0.27639174]), array([0.30099608]), array([0.15401912]), array([0.2001515]), array([0.15789642]), array([0.38445325]), array([0.38445325]), array([0.24645861]), array([0.25148914]), array([0.28126216]), array([0.19979572])]
confs: [[0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [1], [1], [1], [1], [1], [1], [0], [0], [1], [1], [1], [1]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.35698044]), array([0.35750669]), array([0.39125979]), array([0.47135046]), array([0.30348669]), array([0.28803971]), array([0.31532721]), array([0.26133236]), array([0.38658658]), array([0.26190273]), array([0.27937554]), array([0.28913333]), array([0.25922322]), array([0.2533792]), array([0.27077644]), array([0.2773284])]
confs: [[0], [0], [0], [0], [0], [1], [0], [1], [0], [1], [1], [1], [1], [1], [1], [1]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
Take the split file: dor_antag_G_GTP_IC50_splited.csv

The length of test_smis is 35

The length of test_assay_descs is 35


  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.49715446]), array([0.35887446]), array([0.4254411]), array([0.34995399]), array([0.4892363]), array([0.4927383]), array([0.44733029]), array([0.18092464]), array([0.23603157]), array([0.16059234]), array([0.21939865]), array([0.32063671]), array([0.27798742]), array([0.41320692]), array([0.36160857]), array([0.34759018]), array([0.38984005]), array([0.35604164]), array([0.28175367]), array([0.17241792]), array([0.25376507]), array([0.46221131]), array([0.31231776]), array([0.29726203]), array([0.32270433]), array([0.28722912]), array([0.29410855]), array([0.36057209]), array([0.21077087]), array([0.30026098]), array([0.47703044]), array([0.31275087]), array([0.34043707]), array([0.23399601]), array([0.38670719])]
confs: [[0], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [0], [1], [0], [0], [0], [0], [0], [1], [1], [1], [0], [0], [1], [0], [1], [1], [0], [1], [1], [0], [0], [0], [1], [0]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.54851512]), array([0.5643085]), array([0.56975441]), array([0.54300488]), array([0.44963493]), array([0.44686002]), array([0.48975211]), array([0.28549963]), array([0.5599405]), array([0.26499809]), array([0.39694673]), array([0.44100336]), array([0.36109945]), array([0.38679878]), array([0.28058783]), array([0.28691664])]
confs: [[0], [0], [0], [0], [0], [0], [0], [1], [0], [1], [0], [0], [0], [0], [1], [1]]
preds: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Take the split file: dor_bind_RBA_IC50_splited.csv

The length of test_smis is 140

The length of test_assay_descs is 140
probs: [array([0.45965203]), array([0.3523454]), array([0.27131485]), array([0.281968]), array([0.40712192]), array([0.70875991]), array([0.68528814]), array([0.49845836]), array([0.465609]), array([0.34023435]), array([0.54150295]), array([0.29428436]), array([0.22995886]), array([0.30609026]), arr

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.11127863]), array([0.05358706]), array([0.14866484]), array([0.3475476]), array([0.05696756]), array([0.24871493]), array([0.19010563]), array([0.21315894]), array([0.42865536]), array([0.14957134]), array([0.13284658]), array([0.15568949])]
confs: [[1], [1], [1], [0], [1], [1], [1], [1], [0], [1], [1], [1]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
Take the split file: dor_agon_G_GTP_EC50_splited.csv

The length of test_smis is 131

The length of test_assay_descs is 131


  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.50124298]), array([0.7907614]), array([0.53883661]), array([0.46006065]), array([0.35822833]), array([0.55772435]), array([0.58167171]), array([0.69828425]), array([0.63700795]), array([0.51176488]), array([0.68073848]), array([0.49523253]), array([0.37685513]), array([0.32504548]), array([0.53807327]), array([0.4226904]), array([0.49873992]), array([0.26224202]), array([0.36730253]), array([0.3318112]), array([0.24487309]), array([0.6536749]), array([0.37053287]), array([0.39939047]), array([0.22995632]), array([0.10062256]), array([0.4138686]), array([0.59991368]), array([0.61334063]), array([0.63595749]), array([0.73824462]), array([0.35051575]), array([0.20299996]), array([0.41408504]), array([0.34740796]), array([0.35315796]), array([0.36757031]), array([0.21102691]), array([0.18866512]), array([0.14325457]), array([0.14547043]), array([0.13235218]), array([0.16811203]), array([0.22172419]), array([0.30608055]), array([0.40304158]), array([0.20057839]), array([0.2

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.52233735]), array([0.49485395]), array([0.24229117]), array([0.18504933]), array([0.18936466]), array([0.20886467]), array([0.18466566]), array([0.24077515]), array([0.19823476]), array([0.17651839]), array([0.10616187]), array([0.22868996]), array([0.37922559]), array([0.34537018]), array([0.34444348]), array([0.41866653]), array([0.25155219])]
confs: [[0], [0], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0], [0], [0], [1]]
preds: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
Take the split file: nor_bind_RBA_IC50_splited.csv

The length of test_smis is 86

The length of test_assay_descs is 86
probs: [array([0.51934239]), array([0.44995106]), array([0.44260768]), array([0.42983583]), array([0.4434124]), array([0.55969818]), array([0.46977856]), array([0.42499658]), array([0.54394211]), array([0.49210682]), array([0.49210682]), array([0.4464193]), array([0

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.35698044]), array([0.24698596]), array([0.39125979]), array([0.31887261]), array([0.34162769]), array([0.24849901]), array([0.26683018]), array([0.35046422]), array([0.22641322]), array([0.32420252]), array([0.28377802]), array([0.26190273]), array([0.2829091]), array([0.2533792])]
confs: [[0], [1], [0], [0], [0], [1], [1], [0], [1], [0], [1], [1], [1], [1]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
Take the split file: dor_antag_G_GTP_IC50_splited.csv

The length of test_smis is 34

The length of test_assay_descs is 34


  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.51024323]), array([0.35845683]), array([0.47013837]), array([0.44086388]), array([0.53629697]), array([0.52269464]), array([0.4892363]), array([0.42050027]), array([0.47747012]), array([0.45613956]), array([0.39410018]), array([0.2054657]), array([0.24762789]), array([0.11054028]), array([0.11276997]), array([0.34925764]), array([0.27157521]), array([0.2917986]), array([0.3353049]), array([0.32813647]), array([0.35604164]), array([0.25376507]), array([0.68400305]), array([0.21762299]), array([0.33544157]), array([0.28678713]), array([0.34719718]), array([0.36057209]), array([0.34047334]), array([0.32962382]), array([0.32073976]), array([0.36773673]), array([0.30649947]), array([0.28774632])]
confs: [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [0], [1], [1], [0], [0], [0], [1], [1], [1], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1]]
preds: [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.05888347]), array([0.06992743]), array([0.24539919]), array([0.13043698]), array([0.24871493]), array([0.2208061]), array([0.42865536]), array([0.19820406]), array([0.15928122]), array([0.15568949])]
confs: [[1], [1], [1], [1], [1], [1], [0], [1], [1], [1]]
preds: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
true_list: [1, 0, 1, 1, 1, 0, 1, 0, 0, 0]
Take the split file: dor_agon_G_GTP_EC50_splited.csv

The length of test_smis is 130

The length of test_assay_descs is 130


  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.61465318]), array([0.48613593]), array([0.53883661]), array([0.38030188]), array([0.49472522]), array([0.61138426]), array([0.69828425]), array([0.63312352]), array([0.68073848]), array([0.45273884]), array([0.40878585]), array([0.41416922]), array([0.37848602]), array([0.42715627]), array([0.47206693]), array([0.45273884]), array([0.4226904]), array([0.43052314]), array([0.45042053]), array([0.37560998]), array([0.46886917]), array([0.43165127]), array([0.3150326]), array([0.31648476]), array([0.23821238]), array([0.28954297]), array([0.36403132]), array([0.24857672]), array([0.6536749]), array([0.36862087]), array([0.25179053]), array([0.61121992]), array([0.59991368]), array([0.59007674]), array([0.61334063]), array([0.85015166]), array([0.73824462]), array([0.20717981]), array([0.14753363]), array([0.20299996]), array([0.20299229]), array([0.31826743]), array([0.41408504]), array([0.31826743]), array([0.3748635]), array([0.14325457]), array([0.13393386]), array([0.

  _warn_prf(average, modifier, msg_start, len(result))


probs: [array([0.40545052]), array([0.33961796]), array([0.28822441]), array([0.11688151]), array([0.37091475]), array([0.58180313]), array([0.47314904]), array([0.47989469]), array([0.40447533]), array([0.31951918]), array([0.25427511]), array([0.25427511]), array([0.58993649]), array([0.37866689]), array([0.46930022]), array([0.48424536]), array([0.37678469]), array([0.38590291]), array([0.38590291]), array([0.47970669]), array([0.47970669]), array([0.40687431]), array([0.41332968]), array([0.41332968]), array([0.44944205]), array([0.22408806]), array([0.33714482]), array([0.20703705]), array([0.4518297]), array([0.84843932]), array([0.62007886]), array([0.15655432]), array([0.20178006]), array([0.2009996]), array([0.18338091]), array([0.37300631]), array([0.68347906]), array([0.4880734]), array([0.40324684]), array([0.48979727]), array([0.30067374]), array([0.27376405]), array([0.48346604]), array([0.62681713]), array([0.60993882]), array([0.13535394]), array([0.28568349]), array([0

conf = confidence

`confidence = [1 if (x >= active_threshold) or (x <= inactive_threshold) else 0 for x in pred]`

pred --> predict --> predicted probabilities

`pred = self.lgbm.predict_proba(embedding)[:, 1]`