In [2]:
# conda env: clamp_env (Python 3.8.0)
import os
import sys

import pandas as pd
import torch
import torch.nn.functional as F
import numpy 

from datacat4ml.const import *
from datacat4ml.utils import mkdirs
from datacat4ml.Scripts.model_dev.metrics import *

# clamp models can only be used for classification tasks
import clamp

# Load the test datasets

In [3]:
for use_clustering in Use_clusterings:

    print (f'---> Use clustering: {use_clustering}')

    folder_path = os.path.join(SPLIT_CAT_DATASETS_DIR, 'cls', 'use_clustering'+'_'+str(use_clustering))
    files = os.listdir(folder_path)
    split_files = [file for file in files if file.endswith('split.csv')]

    for split_file in split_files:
        
        print(f'Take the split file: {split_file}\n')
        split_file_path = os.path.join(folder_path, split_file)
        df = pd.read_csv(split_file_path)

        # load the clamp model
        model = clamp.CLAMP(device='cpu') # after runnning this file, the folder `data/models/clamp_clip' will be created and stroe the downloaded models
        model.eval()

        # get the smiles and assay descriptions in the test set
        test_smis = df[df['split'] == 'test']['canonical_smiles_by_Std'].tolist()
        print(f'The length of test_smis is {len(test_smis)}')

        test_assay_descs = df[df['split'] == 'test']['assay_desc'].tolist()
        print(f'\nThe length of test_assay_descs is {len(test_assay_descs)}')

        # zip the smiles and assay descriptions
        test_data = zip(test_smis, test_assay_descs)

        # get the true labels
        true_list = df[df['split'] == 'test']['activity'].tolist()

        # calculate the probabilities of the test set
        pred_list = []
        for smi, assay_desc in test_data:
            with torch.no_grad():
                logits = model.forward_dense(smi, assay_desc)
                probs = logits.softmax(dim=0).cpu().numpy()
            active = logits > 0 # The prediction of whether the compound is active or not, if logits > 0, the value is True, meaning the compound is active
            
            pred_list.append(active)            
        
        accuracy = (pred_list.flatten() == torch.tensor(true_list)).float().mean()
        print(f'The accuracy_score of is: {accuracy}')

        # # get the predictions based on the probabilities
        # probs_list = []
        # pred_list = []
        # for i in range(len(probs)):
        #     probs_list.append(probs[i][i])
        #     if probs[i][i] > 0.5:
        #         pred_list.append(1)
        #     else:
        #         pred_list.append(0)
        # print(f'probs_list: {probs_list}')
        # print(f'pred_list: {pred_list}')
        # 
        # # get the true labels
        # true_list = df[df['split'] == 'test']['activity'].tolist()
        # # set the data type of the true labels to int
        # true_list = [int(i) for i in true_list]
        # print(f'true_list: {true_list}')   

        # calculate the metrics
        #accuracy = calc_accuracy(true_list, pred_list)
        precision = calc_precision(true_list, pred_list)
        recall = calc_recall(true_list, pred_list)
        # roc_auc = calc_roc_auc(true_list, probs_list)
        # f1 = calc_f1(true_list, pred_list)
        mcc = calc_mcc(true_list, pred_list)
        
        #bedroc_dec5 = bedroc_score(true_list, pred_list, alpha=321.9)
        #bedroc_2 = bedroc_score(true_list, pred_list, alpha=80.5)
        #bedroc_8 = bedroc_score(true_list, pred_list, alpha=20.0)

        # columns to be written to the results file
        file_path_name = 'CAT_ORs'
        task = 'cls'
        use_smote = 'None'
        target = df['target_chembl_id'].iloc[0]
        effect = df['effect'].iloc[0]
        assay = df['assay'].iloc[0]
        std_type = df['std_type'].iloc[0]
        descriptor = 'FP'
        algoname = 'CLAMP'
        n_compounds = len(df)
        n_cliff_compounds = 'NA'
        n_compounds_train = len(df[df['split'] == 'train'])
        n_cliff_compounds_train = 'NA'
        n_compounds_test = len(df[df['split'] == 'test'])
        n_cliff_compounds_test = 'NA'
        threshold = df['threshold'].iloc[0]
        rmse = 'None'
        cliff_rmse = 'None'
        r2 = 'None'
        cliff_r2 = 'None'

        # temp
        bedroc_dec5 = 'None'
        bedroc_2 = 'None'
        bedroc_8 = 'None'
        roc_auc = 'None'
        f1 = 'None'

        if not os.path.isfile(os.path.join(BMK_CAT_DIR , 'results_clamp.csv')):
            with open(os.path.join(BMK_CAT_DIR , 'results_clamp.csv'), 'w') as f:
                f.write('file_path,task,use_clustering,use_smote,'
                        'target,effect,assay,std_type,descriptor,algo,'
                        'n_compounds,n_cliff_compounds,n_compounds_train,n_cliff_compounds_train,n_compounds_test,n_cliff_compounds_test,'
                        'threshold,'
                        'accuracy,precision,recall,mcc,bedroc_dec5,bedroc_2,bedroc_8,'
                        'rmse,cliff_rmse,r2,cliff_r2\n')
        
        with open(os.path.join(BMK_CAT_DIR , 'results_clamp.csv'), 'a') as f:
            f.write(f'{file_path_name},{task},{use_clustering},{use_smote},'
                    f'{target},{effect},{assay},{std_type},{descriptor},{algoname},'
                    f'{n_compounds},{n_cliff_compounds},{n_compounds_train},{n_cliff_compounds_train},{n_compounds_test},{n_cliff_compounds_test},'
                    f'{threshold},' 
                    f'{accuracy},{precision},{recall},{mcc},{bedroc_dec5},{bedroc_2},{bedroc_8},'
                    f'{rmse},{cliff_rmse},{r2},{cliff_r2} \n')

---> Use clustering: True
Take the split file: kor_agon_B_arrest_EC50_split.csv



  cp = torch.load(self.checkpoint, map_location=device)
[32m2025-01-09 20:09:51.822[0m | [1mINFO    [0m | [36mclamp.models.pretrained[0m:[36m__init__[0m:[36m33[0m - [1mLoaded pretrained model from data/models/clamp_clip/checkpoint.pt[0m


The length of test_smis is 12

The length of test_assay_descs is 12


  from .autonotebook import tqdm as notebook_tqdm
[20:09:53] SMILES Parse Error: syntax error while parsing: [
[20:09:53] SMILES Parse Error: Failed parsing SMILES '[' for input: '['

[20:09:53] SMILES Parse Error: syntax error while parsing: @
[20:09:53] SMILES Parse Error: Failed parsing SMILES '@' for input: '@'

[20:09:53] SMILES Parse Error: syntax error while parsing: H
[20:09:53] SMILES Parse Error: Failed parsing SMILES 'H' for input: 'H'

[20:09:53] SMILES Parse Error: syntax error while parsing: ]
[20:09:53] SMILES Parse Error: Failed parsing SMILES ']' for input: ']'

[20:09:53] SMILES Parse Error: syntax error while parsing: 1
[20:09:53] SMILES Parse Error: Failed parsing SMILES '1' for input: '1'

[20:09:53] SMILES Parse Error: syntax error while parsing: [
[20:09:53] SMILES Parse Error: Failed parsing SMILES '[' for input: '['

[20:09:53] SMILES Parse Error: syntax error while parsing: @
[20:09:53] SMILES Parse Error: Failed parsing SMILES '@' for input: '@'

[20:09:53] S

AttributeError: 'list' object has no attribute 'flatten'