In [5]:
# conda env: pyg (Python3.9.16)
# --> 
# 1. improve it by adding multiprocessing feature
# 2. add scaffold split for classification task. Because now three spliting methods:
#    2.1 cluster split + stratified by cliff

import os
import sys
from typing import List, Union
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering

from datacat4ml.utils import mkdirs, get_df_name
from datacat4ml.const import *
from datacat4ml.Scripts.data_prep.data_split.split_utils.cliff import ActivityCliffs, get_tanimoto_matrix

# Split the categorized datasets

In [None]:
def data_spliter(smiles: List[str], pStandard_value: Union[List[float], np.array], 
                 similarity: float=0.9, potency_fold: int=1, 
                 use_clustering: bool=True, n_clusters: int=5, test_size: float=0.2, task: str='cls') -> pd.DataFrame:
    """
    Split the data into train and test sets according to activity cliffs and compound charateristics.

    :param smiles: List of SMILES strings
    :param pStandard_value: List of pStandard_values or np.array of pStandard_values
    :param activity: List of active/inactive labels, where 1 is active and 0 is inactive
    :param similarity: Threshold value to determine structural similarity
    :param potency_fold: Threshold value to determine potency difference, where the potency here is the pStandard_value
    :param use_clustering: Whether to use clustering to split the data
    :param n_clusters: Number of clusters to use if clustering is used
    :param test_size: Test set size

    :return: A dataframe 
    """

    if len(smiles) < 50:
        print(f"The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.")
        return None

    if use_clustering:
        # # cluster the dabaset into 5 clusters based on tanimoto distance matrix
        spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=RANDOM_SEED)
        clusters = spectral.fit(get_tanimoto_matrix(smiles)).labels_ # get the cluster labels for each molecule

        train_idx, test_idx = [], []
        for cluster in range(n_clusters):
            num_in_cluster = len(np.where(clusters == cluster)[0])
            print(f"Cluster {cluster}: {num_in_cluster} data points")

            # get the indices of molecules in the current cluster
            cluster_idx = np.where(clusters == cluster)[0] # `[0]` is need to convert the tuple to a list

            if task == 'reg':
                # get activity cliffs
                cliffs = ActivityCliffs(smiles, pStandard_value)
                cliff_mols = cliffs.get_cliff_molecules(return_smiles=False, similarity=similarity, potency_fold=potency_fold)
                clust_cliff_mols = [cliff_mols[i] for i in cluster_idx]
            
                # can only split data stratified by cliff molecules if there are more than 1 cliff molecules in the cluster, else just split randomly
                if sum(clust_cliff_mols) > 2:
                    try: # try to avoid the Value ERROR when the number of class in the training set is less than 2
                        clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                            stratify=clust_cliff_mols, # ensure that the train/test split has the same proportion of cliff molecules
                                                                            random_state=RANDOM_SEED, shuffle=True)
                    except ValueError:
                        # This block will be executed if an error occurs in the try block
                        print("An error occurred while stratifying data based on cliff molecules. Skipping stratification.")
                        clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                            random_state=RANDOM_SEED, shuffle=True)
                else:
                    clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                        random_state=RANDOM_SEED, shuffle=True)

            elif task == 'cls':
                print("Classification task")
                clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                    random_state=RANDOM_SEED, shuffle=True)
                print(f'done splitting in task {task}')

            train_idx.extend(clust_train_idx)
            test_idx.extend(clust_test_idx)
            
    else:
        # don't use clustering before spliting. This is to avoid potential cheating by using clustering to split the data, 
        # which make the structual diversity of the train and test sets the same
        if task == 'reg':
            if sum(cliff_mols) > 2:
                train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                stratify=cliff_mols, # ensure that the train/test split has the same proportion of cliff molecules
                                                random_state=RANDOM_SEED, shuffle=True)
            else:
                train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                random_state=RANDOM_SEED, shuffle=True)
        elif task == 'cls':
            train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                random_state=RANDOM_SEED, shuffle=True)
            
    train_test = []
    for i in range(len(smiles)):
        if i in train_idx:
            train_test.append('train')
        elif i in test_idx:
            train_test.append('test')
        else:
            raise ValueError('Index not in train or test set')
    
    if task == 'reg':
        return pd.DataFrame({'cliff_mol': cliff_mols,
                            'split': train_test})
    elif task == 'cls':
        return pd.DataFrame({'split': train_test})

In [None]:
def split_data(input_filepath = CURA_CAT_DATASETS_DIR, output_filepath = SPLIT_CAT_DATASETS_DIR,
               task:str = 'cls', use_clustering: bool=True):
    
    # access the final csv files obtained from data curation
    folder_path = os.path.join(input_filepath, task)
    files = os.listdir(folder_path)
    curated_files = [file for file in files if file.endswith('curated.csv')]

    # make new directory to store the featurized data
    output_path = os.path.join(output_filepath, task, 'use_clustering'+'_'+str(use_clustering))
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    mkdirs(output_path)

    for curated_file in curated_files:
        print (f"curated_file is: {curated_file}\n")

        df = pd.read_csv(os.path.join(folder_path, curated_file))
        df = df.drop(columns=['Unnamed: 0'])

        # split the data into train and test sets
        data_splited_df = data_spliter(df['canonical_smiles_by_Std'].tolist(), df['pStandard_value'].tolist(), df['activity'].tolist(), 
                                       use_clustering=use_clustering, task=task)
        
        # save data_splited_df as a csv file if it is not None
        if data_splited_df is not None:
            concat_df = pd.concat([df, data_splited_df], axis=1)
            concat_df.to_csv(os.path.join(output_path, f'{curated_file[:-11]}split.csv'), index=False)

In [8]:
use_clusterings = [True, False]

for task in Tasks:
    print (f"task is: {task}\n")
    
    for use_clustering in use_clusterings:
        print (f"use_clustering is: {use_clustering}\n")
        
        split_data(input_filepath = CURA_CAT_DATASETS_DIR, output_filepath = SPLIT_CAT_DATASETS_DIR, 
                   task=task, use_clustering=use_clustering)
        print(f'Done!\n====================================\n')
#It took around 1 hour to run the above code locally

task is: cls

use_clustering is: True

curated_file is: nor_antag_G_GTP_IC50_curated.csv



100%|██████████| 150/150 [00:00<00:00, 7404.85it/s]


Cluster 0: 32 data points
Classification task
done splitting in task cls
Cluster 1: 26 data points
Classification task
done splitting in task cls
Cluster 2: 44 data points
Classification task
done splitting in task cls
Cluster 3: 40 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_B_arrest_IC50_curated.csv



100%|██████████| 53/53 [00:00<00:00, 13594.55it/s]


Cluster 0: 18 data points
Classification task
done splitting in task cls
Cluster 1: 12 data points
Classification task
done splitting in task cls
Cluster 2: 6 data points
Classification task
done splitting in task cls
Cluster 3: 7 data points
Classification task
done splitting in task cls
Cluster 4: 10 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: kor_agon_G_cAMP_IC50_curated.csv

The number of data po

100%|██████████| 73/73 [00:00<00:00, 9838.19it/s]


Cluster 0: 19 data points
Classification task
done splitting in task cls
Cluster 1: 28 data points
Classification task
done splitting in task cls
Cluster 2: 10 data points
Classification task
done splitting in task cls
Cluster 3: 11 data points
Classification task
done splitting in task cls
Cluster 4: 5 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 55/55 [00:00<00:00, 12741.60it/s]


Cluster 0: 18 data points
Classification task
done splitting in task cls
Cluster 1: 18 data points
Classification task
done splitting in task cls
Cluster 2: 5 data points
Classification task
done splitting in task cls
Cluster 3: 6 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_GTP_EC50_curated.csv



100%|██████████| 980/980 [00:00<00:00, 1145.87it/s]


Cluster 0: 295 data points
Classification task
done splitting in task cls
Cluster 1: 242 data points
Classification task
done splitting in task cls
Cluster 2: 145 data points
Classification task
done splitting in task cls
Cluster 3: 222 data points
Classification task
done splitting in task cls
Cluster 4: 76 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 125/125 [00:00<00:00, 5983.38it/s]


Cluster 0: 28 data points
Classification task
done splitting in task cls
Cluster 1: 24 data points
Classification task
done splitting in task cls
Cluster 2: 18 data points
Classification task
done splitting in task cls
Cluster 3: 44 data points
Classification task
done splitting in task cls
Cluster 4: 11 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_G_GTP_Ki_curated.csv



100%|██████████| 61/61 [00:00<00:00, 5937.08it/s]


Cluster 0: 10 data points
Classification task
done splitting in task cls
Cluster 1: 27 data points
Classification task
done splitting in task cls
Cluster 2: 6 data points
Classification task
done splitting in task cls
Cluster 3: 10 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_Ca_EC50_curated.csv



100%|██████████| 74/74 [00:00<00:00, 9183.61it/s]

Cluster 0: 5 data points
Classification task
done splitting in task cls
Cluster 1: 16 data points
Classification task
done splitting in task cls
Cluster 2: 45 data points
Classification task
done splitting in task cls
Cluster 3: 2 data points
Classification task
done splitting in task cls
Cluster 4: 6 data points
Classification task
done splitting in task cls
curated_file is: dor_antag_G_GTP_IC50_curated.csv




100%|██████████| 169/169 [00:00<00:00, 4425.64it/s]


Cluster 0: 54 data points
Classification task
done splitting in task cls
Cluster 1: 35 data points
Classification task
done splitting in task cls
Cluster 2: 18 data points
Classification task
done splitting in task cls
Cluster 3: 15 data points
Classification task
done splitting in task cls
Cluster 4: 47 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_B_arrest_EC50_curated.csv



100%|██████████| 207/207 [00:00<00:00, 3630.85it/s]


Cluster 0: 47 data points
Classification task
done splitting in task cls
Cluster 1: 48 data points
Classification task
done splitting in task cls
Cluster 2: 45 data points
Classification task
done splitting in task cls
Cluster 3: 35 data points
Classification task
done splitting in task cls
Cluster 4: 32 data points
Classification task
done splitting in task cls
curated_file is: kor_bind_RBA_IC50_curated.csv



100%|██████████| 416/416 [00:00<00:00, 2606.02it/s]


Cluster 0: 53 data points
Classification task
done splitting in task cls
Cluster 1: 224 data points
Classification task
done splitting in task cls
Cluster 2: 70 data points
Classification task
done splitting in task cls
Cluster 3: 20 data points
Classification task
done splitting in task cls
Cluster 4: 49 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 253/253 [00:00<00:00, 3607.27it/s]


Cluster 0: 64 data points
Classification task
done splitting in task cls
Cluster 1: 39 data points
Classification task
done splitting in task cls
Cluster 2: 37 data points
Classification task
done splitting in task cls
Cluster 3: 65 data points
Classification task
done splitting in task cls
Cluster 4: 48 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_GTP_EC50_curated.csv



100%|██████████| 1246/1246 [00:01<00:00, 887.07it/s]


Cluster 0: 124 data points
Classification task
done splitting in task cls
Cluster 1: 212 data points
Classification task
done splitting in task cls
Cluster 2: 411 data points
Classification task
done splitting in task cls
Cluster 3: 311 data points
Classification task
done splitting in task cls
Cluster 4: 188 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_GTP_EC50_curated.csv



100%|██████████| 222/222 [00:00<00:00, 5079.40it/s]


Cluster 0: 42 data points
Classification task
done splitting in task cls
Cluster 1: 66 data points
Classification task
done splitting in task cls
Cluster 2: 20 data points
Classification task
done splitting in task cls
Cluster 3: 63 data points
Classification task
done splitting in task cls
Cluster 4: 31 data points
Classification task
done splitting in task cls
curated_file is: mor_bind_RBA_IC50_curated.csv



100%|██████████| 582/582 [00:00<00:00, 1942.64it/s]


Cluster 0: 77 data points
Classification task
done splitting in task cls
Cluster 1: 100 data points
Classification task
done splitting in task cls
Cluster 2: 54 data points
Classification task
done splitting in task cls
Cluster 3: 20 data points
Classification task
done splitting in task cls
Cluster 4: 331 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_G_GTP_IC50_curated.csv



100%|██████████| 185/185 [00:00<00:00, 6196.71it/s]


Cluster 0: 48 data points
Classification task
done splitting in task cls
Cluster 1: 82 data points
Classification task
done splitting in task cls
Cluster 2: 23 data points
Classification task
done splitting in task cls
Cluster 3: 16 data points
Classification task
done splitting in task cls
Cluster 4: 16 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_cAMP_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: nor_bind_RBA_IC50_curated.csv



100%|██████████| 417/417 [00:00<00:00, 2730.85it/s]


Cluster 0: 201 data points
Classification task
done splitting in task cls
Cluster 1: 46 data points
Classification task
done splitting in task cls
Cluster 2: 33 data points
Classification task
done splitting in task cls
Cluster 3: 75 data points
Classification task
done splitting in task cls
Cluster 4: 62 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 69/69 [00:00<00:00, 10019.28it/s]

Cluster 0: 16 data points
Classification task
done splitting in task cls
Cluster 1: 8 data points
Classification task
done splitting in task cls
Cluster 2: 4 data points
Classification task
done splitting in task cls
Cluster 3: 5 data points
Classification task
done splitting in task cls
Cluster 4: 36 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_bind_RBA_Ki_curated.csv




100%|██████████| 4654/4654 [00:19<00:00, 239.32it/s] 


Cluster 0: 2534 data points
Classification task
done splitting in task cls
Cluster 1: 1201 data points
Classification task
done splitting in task cls
Cluster 2: 149 data points
Classification task
done splitting in task cls
Cluster 3: 545 data points
Classification task
done splitting in task cls
Cluster 4: 225 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_G_GTP_EC50_curated.csv



100%|██████████| 648/648 [00:00<00:00, 1725.12it/s]


Cluster 0: 115 data points
Classification task
done splitting in task cls
Cluster 1: 110 data points
Classification task
done splitting in task cls
Cluster 2: 224 data points
Classification task
done splitting in task cls
Cluster 3: 46 data points
Classification task
done splitting in task cls
Cluster 4: 153 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_Ca_EC50_curated.csv



100%|██████████| 144/144 [00:00<00:00, 5138.81it/s]


Cluster 0: 24 data points
Classification task
done splitting in task cls
Cluster 1: 25 data points
Classification task
done splitting in task cls
Cluster 2: 26 data points
Classification task
done splitting in task cls
Cluster 3: 35 data points
Classification task
done splitting in task cls
Cluster 4: 34 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_G_GTP_Ki_curated.csv



100%|██████████| 53/53 [00:00<00:00, 8807.02it/s]


Cluster 0: 12 data points
Classification task
done splitting in task cls
Cluster 1: 3 data points
Classification task
done splitting in task cls
Cluster 2: 10 data points
Classification task
done splitting in task cls
Cluster 3: 10 data points
Classification task
done splitting in task cls
Cluster 4: 18 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_G_GTP_IC50_curated.csv



100%|██████████| 339/339 [00:00<00:00, 3089.08it/s]


Cluster 0: 71 data points
Classification task
done splitting in task cls
Cluster 1: 113 data points
Classification task
done splitting in task cls
Cluster 2: 56 data points
Classification task
done splitting in task cls
Cluster 3: 17 data points
Classification task
done splitting in task cls
Cluster 4: 82 data points
Classification task
done splitting in task cls
curated_file is: nor_bind_RBA_Ki_curated.csv



100%|██████████| 1142/1142 [00:01<00:00, 976.49it/s]


Cluster 0: 527 data points
Classification task
done splitting in task cls
Cluster 1: 245 data points
Classification task
done splitting in task cls
Cluster 2: 85 data points
Classification task
done splitting in task cls
Cluster 3: 141 data points
Classification task
done splitting in task cls
Cluster 4: 144 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 435/435 [00:00<00:00, 2608.77it/s]


Cluster 0: 110 data points
Classification task
done splitting in task cls
Cluster 1: 64 data points
Classification task
done splitting in task cls
Cluster 2: 57 data points
Classification task
done splitting in task cls
Cluster 3: 78 data points
Classification task
done splitting in task cls
Cluster 4: 126 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_B_arrest_EC50_curated.csv



100%|██████████| 50/50 [00:00<00:00, 13483.04it/s]

Cluster 0: 4 data points
Classification task
done splitting in task cls
Cluster 1: 13 data points
Classification task
done splitting in task cls
Cluster 2: 9 data points
Classification task
done splitting in task cls
Cluster 3: 12 data points
Classification task
done splitting in task cls
Cluster 4: 12 data points
Classification task
done splitting in task cls
curated_file is: dor_bind_RBA_Ki_curated.csv




100%|██████████| 4035/4035 [00:14<00:00, 276.31it/s] 


Cluster 0: 226 data points
Classification task
done splitting in task cls
Cluster 1: 875 data points
Classification task
done splitting in task cls
Cluster 2: 2335 data points
Classification task
done splitting in task cls
Cluster 3: 150 data points
Classification task
done splitting in task cls
Cluster 4: 449 data points
Classification task
done splitting in task cls
curated_file is: kor_bind_RBA_Ki_curated.csv



100%|██████████| 3860/3860 [00:13<00:00, 286.29it/s] 


Cluster 0: 301 data points
Classification task
done splitting in task cls
Cluster 1: 1936 data points
Classification task
done splitting in task cls
Cluster 2: 318 data points
Classification task
done splitting in task cls
Cluster 3: 401 data points
Classification task
done splitting in task cls
Cluster 4: 904 data points
Classification task
done splitting in task cls
curated_file is: nor_antag_G_GTP_Ki_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_IC50_curated.csv



100%|██████████| 693/693 [00:00<00:00, 1622.97it/s]


Cluster 0: 82 data points
Classification task
done splitting in task cls
Cluster 1: 315 data points
Classification task
done splitting in task cls
Cluster 2: 154 data points
Classification task
done splitting in task cls
Cluster 3: 88 data points
Classification task
done splitting in task cls
Cluster 4: 54 data points
Classification task
done splitting in task cls
Done!

use_clustering is: False

curated_file is: nor_antag_G_GTP_IC50_curated.csv

curated_file is: kor_antag_B_arrest_IC50_curated.csv

curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is n

100%|██████████| 120/120 [00:00<00:00, 9307.75it/s]


Cluster 0: 22 data points


100%|██████████| 120/120 [00:00<00:00, 3502.28it/s]
100%|██████████| 120/120 [00:00<00:00, 9229.58it/s]
100%|██████████| 120/120 [00:00<00:00, 16257.52it/s]
100%|██████████| 120/120 [00:00<00:00, 33558.91it/s]


Cluster 1: 44 data points


100%|██████████| 120/120 [00:00<00:00, 9184.61it/s]
100%|██████████| 120/120 [00:00<00:00, 9053.76it/s]
100%|██████████| 120/120 [00:00<00:00, 16420.35it/s]
100%|██████████| 120/120 [00:00<00:00, 33952.81it/s]


Cluster 2: 24 data points


100%|██████████| 120/120 [00:00<00:00, 9381.31it/s]
100%|██████████| 120/120 [00:00<00:00, 8868.23it/s]
100%|██████████| 120/120 [00:00<00:00, 16059.36it/s]
100%|██████████| 120/120 [00:00<00:00, 33561.14it/s]


Cluster 3: 8 data points


100%|██████████| 120/120 [00:00<00:00, 9041.40it/s]
100%|██████████| 120/120 [00:00<00:00, 9006.93it/s]
100%|██████████| 120/120 [00:00<00:00, 16146.43it/s]
100%|██████████| 120/120 [00:00<00:00, 33165.29it/s]


Cluster 4: 22 data points


100%|██████████| 120/120 [00:00<00:00, 9032.63it/s]
100%|██████████| 120/120 [00:00<00:00, 8874.64it/s]
100%|██████████| 120/120 [00:00<00:00, 15698.71it/s]
100%|██████████| 120/120 [00:00<00:00, 33718.53it/s]


curated_file is: kor_antag_B_arrest_IC50_curated.csv



100%|██████████| 52/52 [00:00<00:00, 20659.64it/s]


Cluster 0: 18 data points


100%|██████████| 52/52 [00:00<00:00, 13469.02it/s]
100%|██████████| 52/52 [00:00<00:00, 13464.86it/s]
100%|██████████| 52/52 [00:00<00:00, 26581.82it/s]
100%|██████████| 52/52 [00:00<00:00, 55652.92it/s]


Cluster 1: 6 data points


100%|██████████| 52/52 [00:00<00:00, 13406.92it/s]
100%|██████████| 52/52 [00:00<00:00, 13309.56it/s]
100%|██████████| 52/52 [00:00<00:00, 26001.88it/s]
100%|██████████| 52/52 [00:00<00:00, 55090.63it/s]

Cluster 2: 12 data points



100%|██████████| 52/52 [00:00<00:00, 19550.36it/s]
100%|██████████| 52/52 [00:00<00:00, 20779.71it/s]
100%|██████████| 52/52 [00:00<00:00, 36350.63it/s]
100%|██████████| 52/52 [00:00<00:00, 68478.43it/s]


Cluster 3: 10 data points


100%|██████████| 52/52 [00:00<00:00, 21568.81it/s]
100%|██████████| 52/52 [00:00<00:00, 20799.52it/s]
100%|██████████| 52/52 [00:00<00:00, 36344.58it/s]
100%|██████████| 52/52 [00:00<00:00, 74949.76it/s]


Cluster 4: 6 data points


100%|██████████| 52/52 [00:00<00:00, 20875.17it/s]
100%|██████████| 52/52 [00:00<00:00, 20817.39it/s]
100%|██████████| 52/52 [00:00<00:00, 36181.79it/s]
100%|██████████| 52/52 [00:00<00:00, 74846.88it/s]


curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: kor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_G_GTP_Ki_curated.csv



100%|██████████| 71/71 [00:00<00:00, 15362.96it/s]


Cluster 0: 5 data points


100%|██████████| 71/71 [00:00<00:00, 9673.71it/s]
100%|██████████| 71/71 [00:00<00:00, 10005.90it/s]
100%|██████████| 71/71 [00:00<00:00, 18487.43it/s]
100%|██████████| 71/71 [00:00<00:00, 43505.56it/s]


Cluster 1: 26 data points


100%|██████████| 71/71 [00:00<00:00, 10074.28it/s]
100%|██████████| 71/71 [00:00<00:00, 15670.98it/s]
100%|██████████| 71/71 [00:00<00:00, 24463.61it/s]
100%|██████████| 71/71 [00:00<00:00, 55735.65it/s]


Cluster 2: 11 data points


100%|██████████| 71/71 [00:00<00:00, 15490.82it/s]
100%|██████████| 71/71 [00:00<00:00, 15537.70it/s]
100%|██████████| 71/71 [00:00<00:00, 24713.33it/s]
100%|██████████| 71/71 [00:00<00:00, 55496.75it/s]


Cluster 3: 10 data points


100%|██████████| 71/71 [00:00<00:00, 15544.19it/s]
100%|██████████| 71/71 [00:00<00:00, 15558.00it/s]
100%|██████████| 71/71 [00:00<00:00, 24395.48it/s]
100%|██████████| 71/71 [00:00<00:00, 56071.47it/s]


Cluster 4: 19 data points


100%|██████████| 71/71 [00:00<00:00, 15207.62it/s]
100%|██████████| 71/71 [00:00<00:00, 15435.42it/s]
100%|██████████| 71/71 [00:00<00:00, 24678.51it/s]
100%|██████████| 71/71 [00:00<00:00, 55300.94it/s]


curated_file is: mor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 53/53 [00:00<00:00, 20454.37it/s]


Cluster 0: 17 data points


100%|██████████| 53/53 [00:00<00:00, 12961.99it/s]
100%|██████████| 53/53 [00:00<00:00, 13059.46it/s]
100%|██████████| 53/53 [00:00<00:00, 21719.41it/s]
100%|██████████| 53/53 [00:00<00:00, 46167.83it/s]


Cluster 1: 5 data points


100%|██████████| 53/53 [00:00<00:00, 13015.11it/s]
100%|██████████| 53/53 [00:00<00:00, 13049.49it/s]
100%|██████████| 53/53 [00:00<00:00, 22314.61it/s]
100%|██████████| 53/53 [00:00<00:00, 54258.75it/s]


Cluster 2: 17 data points


100%|██████████| 53/53 [00:00<00:00, 20199.74it/s]
100%|██████████| 53/53 [00:00<00:00, 20214.43it/s]
100%|██████████| 53/53 [00:00<00:00, 31012.57it/s]
100%|██████████| 53/53 [00:00<00:00, 73365.71it/s]


Cluster 3: 6 data points


100%|██████████| 53/53 [00:00<00:00, 20267.88it/s]
100%|██████████| 53/53 [00:00<00:00, 20254.95it/s]
100%|██████████| 53/53 [00:00<00:00, 31243.59it/s]
100%|██████████| 53/53 [00:00<00:00, 74124.08it/s]


Cluster 4: 8 data points


100%|██████████| 53/53 [00:00<00:00, 20575.54it/s]
100%|██████████| 53/53 [00:00<00:00, 20163.09it/s]
100%|██████████| 53/53 [00:00<00:00, 30986.63it/s]
100%|██████████| 53/53 [00:00<00:00, 73730.72it/s]


curated_file is: mor_agon_G_GTP_EC50_curated.csv



100%|██████████| 869/869 [00:00<00:00, 1275.88it/s]


Cluster 0: 228 data points


100%|██████████| 869/869 [00:00<00:00, 1264.23it/s]
100%|██████████| 869/869 [00:00<00:00, 1253.11it/s]
100%|██████████| 869/869 [00:00<00:00, 1576.28it/s]
100%|██████████| 869/869 [00:00<00:00, 4680.08it/s]


Cluster 1: 238 data points


100%|██████████| 869/869 [00:00<00:00, 1287.59it/s]
100%|██████████| 869/869 [00:00<00:00, 1255.92it/s]
100%|██████████| 869/869 [00:00<00:00, 1572.23it/s]
100%|██████████| 869/869 [00:00<00:00, 4721.88it/s]


Cluster 2: 55 data points


100%|██████████| 869/869 [00:00<00:00, 1286.98it/s]
100%|██████████| 869/869 [00:00<00:00, 1283.39it/s]
100%|██████████| 869/869 [00:00<00:00, 1584.84it/s]
100%|██████████| 869/869 [00:00<00:00, 4663.56it/s]


Cluster 3: 91 data points


100%|██████████| 869/869 [00:00<00:00, 1290.98it/s]
100%|██████████| 869/869 [00:00<00:00, 1260.65it/s]
100%|██████████| 869/869 [00:00<00:00, 1576.89it/s]
100%|██████████| 869/869 [00:00<00:00, 4722.67it/s]


Cluster 4: 257 data points


100%|██████████| 869/869 [00:00<00:00, 1288.95it/s]
100%|██████████| 869/869 [00:00<00:00, 1291.08it/s]
100%|██████████| 869/869 [00:00<00:00, 1567.65it/s]
100%|██████████| 869/869 [00:00<00:00, 4692.93it/s]


curated_file is: dor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 113/113 [00:00<00:00, 9632.87it/s]


Cluster 0: 10 data points


100%|██████████| 113/113 [00:00<00:00, 6254.37it/s]
100%|██████████| 113/113 [00:00<00:00, 8386.23it/s]
100%|██████████| 113/113 [00:00<00:00, 14070.25it/s]
100%|██████████| 113/113 [00:00<00:00, 36017.66it/s]


Cluster 1: 40 data points


100%|██████████| 113/113 [00:00<00:00, 9941.61it/s]
100%|██████████| 113/113 [00:00<00:00, 9856.84it/s]
100%|██████████| 113/113 [00:00<00:00, 14082.37it/s]
100%|██████████| 113/113 [00:00<00:00, 36712.34it/s]


Cluster 2: 28 data points


100%|██████████| 113/113 [00:00<00:00, 9691.37it/s]
100%|██████████| 113/113 [00:00<00:00, 9793.09it/s]
100%|██████████| 113/113 [00:00<00:00, 13884.76it/s]
100%|██████████| 113/113 [00:00<00:00, 32763.47it/s]


Cluster 3: 23 data points


100%|██████████| 113/113 [00:00<00:00, 9853.56it/s]
100%|██████████| 113/113 [00:00<00:00, 9460.58it/s]
100%|██████████| 113/113 [00:00<00:00, 14054.39it/s]
100%|██████████| 113/113 [00:00<00:00, 35746.01it/s]


Cluster 4: 12 data points


100%|██████████| 113/113 [00:00<00:00, 9528.10it/s]
100%|██████████| 113/113 [00:00<00:00, 9691.57it/s]
100%|██████████| 113/113 [00:00<00:00, 14234.21it/s]
100%|██████████| 113/113 [00:00<00:00, 36379.82it/s]


curated_file is: mor_antag_G_GTP_Ki_curated.csv



100%|██████████| 61/61 [00:00<00:00, 17724.46it/s]


Cluster 0: 10 data points


100%|██████████| 61/61 [00:00<00:00, 10881.79it/s]
100%|██████████| 61/61 [00:00<00:00, 11078.75it/s]
100%|██████████| 61/61 [00:00<00:00, 19927.76it/s]
100%|██████████| 61/61 [00:00<00:00, 47885.56it/s]


Cluster 1: 27 data points


100%|██████████| 61/61 [00:00<00:00, 11289.44it/s]
100%|██████████| 61/61 [00:00<00:00, 17771.24it/s]
100%|██████████| 61/61 [00:00<00:00, 28786.29it/s]
100%|██████████| 61/61 [00:00<00:00, 63204.68it/s]


Cluster 2: 6 data points


100%|██████████| 61/61 [00:00<00:00, 18126.29it/s]
100%|██████████| 61/61 [00:00<00:00, 17934.43it/s]
100%|██████████| 61/61 [00:00<00:00, 28590.07it/s]
100%|██████████| 61/61 [00:00<00:00, 64674.56it/s]


Cluster 3: 10 data points


100%|██████████| 61/61 [00:00<00:00, 18273.88it/s]
100%|██████████| 61/61 [00:00<00:00, 17554.21it/s]
100%|██████████| 61/61 [00:00<00:00, 28808.98it/s]
100%|██████████| 61/61 [00:00<00:00, 64268.41it/s]


Cluster 4: 8 data points


100%|██████████| 61/61 [00:00<00:00, 18352.52it/s]
100%|██████████| 61/61 [00:00<00:00, 17734.29it/s]
100%|██████████| 61/61 [00:00<00:00, 28903.36it/s]
100%|██████████| 61/61 [00:00<00:00, 62220.95it/s]


curated_file is: kor_agon_G_Ca_EC50_curated.csv



100%|██████████| 74/74 [00:00<00:00, 14744.12it/s]


Cluster 0: 5 data points


100%|██████████| 74/74 [00:00<00:00, 9157.33it/s]
100%|██████████| 74/74 [00:00<00:00, 9558.05it/s]
100%|██████████| 74/74 [00:00<00:00, 16344.31it/s]
100%|██████████| 74/74 [00:00<00:00, 40855.40it/s]


An error occurred while stratifying data based on cliff molecules. Skipping stratification.
Cluster 1: 16 data points


100%|██████████| 74/74 [00:00<00:00, 9474.31it/s]
100%|██████████| 74/74 [00:00<00:00, 14709.88it/s]
100%|██████████| 74/74 [00:00<00:00, 21894.65it/s]
100%|██████████| 74/74 [00:00<00:00, 52713.74it/s]


Cluster 2: 45 data points


100%|██████████| 74/74 [00:00<00:00, 14934.25it/s]
100%|██████████| 74/74 [00:00<00:00, 14714.07it/s]
100%|██████████| 74/74 [00:00<00:00, 21928.68it/s]
100%|██████████| 74/74 [00:00<00:00, 54290.45it/s]


Cluster 3: 2 data points


100%|██████████| 74/74 [00:00<00:00, 14844.25it/s]
100%|██████████| 74/74 [00:00<00:00, 13689.34it/s]
100%|██████████| 74/74 [00:00<00:00, 21599.06it/s]
100%|██████████| 74/74 [00:00<00:00, 55743.26it/s]


Cluster 4: 6 data points


100%|██████████| 74/74 [00:00<00:00, 14815.20it/s]
100%|██████████| 74/74 [00:00<00:00, 14789.79it/s]
100%|██████████| 74/74 [00:00<00:00, 21606.58it/s]
100%|██████████| 74/74 [00:00<00:00, 54759.79it/s]


curated_file is: dor_antag_G_GTP_IC50_curated.csv



100%|██████████| 151/151 [00:00<00:00, 7359.54it/s]


Cluster 0: 49 data points


100%|██████████| 151/151 [00:00<00:00, 4830.30it/s]
100%|██████████| 151/151 [00:00<00:00, 7236.27it/s]
100%|██████████| 151/151 [00:00<00:00, 9538.97it/s]
100%|██████████| 151/151 [00:00<00:00, 27135.39it/s]


Cluster 1: 35 data points


100%|██████████| 151/151 [00:00<00:00, 7469.34it/s]
100%|██████████| 151/151 [00:00<00:00, 7360.57it/s]
100%|██████████| 151/151 [00:00<00:00, 9401.34it/s]
100%|██████████| 151/151 [00:00<00:00, 27260.36it/s]


Cluster 2: 15 data points


100%|██████████| 151/151 [00:00<00:00, 7363.05it/s]
100%|██████████| 151/151 [00:00<00:00, 7387.35it/s]
100%|██████████| 151/151 [00:00<00:00, 9517.47it/s]
100%|██████████| 151/151 [00:00<00:00, 26463.04it/s]


Cluster 3: 34 data points


100%|██████████| 151/151 [00:00<00:00, 7369.30it/s]
100%|██████████| 151/151 [00:00<00:00, 7265.74it/s]
100%|██████████| 151/151 [00:00<00:00, 9458.06it/s]
100%|██████████| 151/151 [00:00<00:00, 27167.98it/s]


Cluster 4: 18 data points


100%|██████████| 151/151 [00:00<00:00, 7307.32it/s]
100%|██████████| 151/151 [00:00<00:00, 7311.54it/s]
100%|██████████| 151/151 [00:00<00:00, 9462.01it/s]
100%|██████████| 151/151 [00:00<00:00, 27241.60it/s]


curated_file is: mor_agon_B_arrest_EC50_curated.csv



100%|██████████| 169/169 [00:00<00:00, 6625.14it/s]


Cluster 0: 45 data points


100%|██████████| 169/169 [00:00<00:00, 4275.75it/s]
100%|██████████| 169/169 [00:00<00:00, 6482.70it/s]
100%|██████████| 169/169 [00:00<00:00, 11078.87it/s]
100%|██████████| 169/169 [00:00<00:00, 24744.72it/s]


Cluster 1: 47 data points


100%|██████████| 169/169 [00:00<00:00, 6561.36it/s]
100%|██████████| 169/169 [00:00<00:00, 6525.61it/s]
100%|██████████| 169/169 [00:00<00:00, 11063.66it/s]
100%|██████████| 169/169 [00:00<00:00, 24346.96it/s]


Cluster 2: 46 data points


100%|██████████| 169/169 [00:00<00:00, 6508.83it/s]
100%|██████████| 169/169 [00:00<00:00, 6547.91it/s]
100%|██████████| 169/169 [00:00<00:00, 10824.09it/s]
100%|██████████| 169/169 [00:00<00:00, 24151.19it/s]


Cluster 3: 11 data points


100%|██████████| 169/169 [00:00<00:00, 6525.12it/s]
100%|██████████| 169/169 [00:00<00:00, 6473.05it/s]
100%|██████████| 169/169 [00:00<00:00, 10958.30it/s]
100%|██████████| 169/169 [00:00<00:00, 24430.03it/s]


Cluster 4: 20 data points


100%|██████████| 169/169 [00:00<00:00, 6572.56it/s]
100%|██████████| 169/169 [00:00<00:00, 6500.05it/s]
100%|██████████| 169/169 [00:00<00:00, 11118.49it/s]
100%|██████████| 169/169 [00:00<00:00, 24331.08it/s]


curated_file is: kor_bind_RBA_IC50_curated.csv



100%|██████████| 408/408 [00:00<00:00, 2733.62it/s]


Cluster 0: 68 data points


100%|██████████| 408/408 [00:00<00:00, 2354.97it/s]
100%|██████████| 408/408 [00:00<00:00, 2700.34it/s]
100%|██████████| 408/408 [00:00<00:00, 4507.07it/s]
100%|██████████| 408/408 [00:00<00:00, 10066.09it/s]


Cluster 1: 218 data points


100%|██████████| 408/408 [00:00<00:00, 2712.30it/s]
100%|██████████| 408/408 [00:00<00:00, 2661.06it/s]
100%|██████████| 408/408 [00:00<00:00, 4437.53it/s]
100%|██████████| 408/408 [00:00<00:00, 10009.86it/s]


Cluster 2: 20 data points


100%|██████████| 408/408 [00:00<00:00, 2726.82it/s]
100%|██████████| 408/408 [00:00<00:00, 2688.50it/s]
100%|██████████| 408/408 [00:00<00:00, 4426.74it/s]
100%|██████████| 408/408 [00:00<00:00, 9920.56it/s]


Cluster 3: 53 data points


100%|██████████| 408/408 [00:00<00:00, 2741.62it/s]
100%|██████████| 408/408 [00:00<00:00, 2681.12it/s]
100%|██████████| 408/408 [00:00<00:00, 4478.54it/s]
100%|██████████| 408/408 [00:00<00:00, 10054.32it/s]


Cluster 4: 49 data points


100%|██████████| 408/408 [00:00<00:00, 2736.59it/s]
100%|██████████| 408/408 [00:00<00:00, 2689.84it/s]
100%|██████████| 408/408 [00:00<00:00, 4468.61it/s]
100%|██████████| 408/408 [00:00<00:00, 10013.26it/s]


curated_file is: kor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 237/237 [00:00<00:00, 4744.82it/s]


Cluster 0: 59 data points


100%|██████████| 237/237 [00:00<00:00, 3075.17it/s]
100%|██████████| 237/237 [00:00<00:00, 4696.62it/s]
100%|██████████| 237/237 [00:00<00:00, 4767.37it/s]
100%|██████████| 237/237 [00:00<00:00, 17570.48it/s]


Cluster 1: 63 data points


100%|██████████| 237/237 [00:00<00:00, 4804.40it/s]
100%|██████████| 237/237 [00:00<00:00, 4636.06it/s]
100%|██████████| 237/237 [00:00<00:00, 4802.92it/s]
100%|██████████| 237/237 [00:00<00:00, 17449.27it/s]


Cluster 2: 68 data points


100%|██████████| 237/237 [00:00<00:00, 4783.50it/s]
100%|██████████| 237/237 [00:00<00:00, 4643.97it/s]
100%|██████████| 237/237 [00:00<00:00, 4759.68it/s]
100%|██████████| 237/237 [00:00<00:00, 17450.50it/s]


Cluster 3: 30 data points


100%|██████████| 237/237 [00:00<00:00, 4764.88it/s]
100%|██████████| 237/237 [00:00<00:00, 4605.96it/s]
100%|██████████| 237/237 [00:00<00:00, 4773.37it/s]
100%|██████████| 237/237 [00:00<00:00, 17068.17it/s]


Cluster 4: 17 data points


100%|██████████| 237/237 [00:00<00:00, 4728.26it/s]
100%|██████████| 237/237 [00:00<00:00, 4692.06it/s]
100%|██████████| 237/237 [00:00<00:00, 4837.89it/s]
100%|██████████| 237/237 [00:00<00:00, 17424.50it/s]


curated_file is: kor_agon_G_GTP_EC50_curated.csv



100%|██████████| 1022/1022 [00:00<00:00, 1076.29it/s]


Cluster 0: 359 data points


100%|██████████| 1022/1022 [00:00<00:00, 1076.74it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1057.21it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1272.46it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3977.53it/s]


Cluster 1: 233 data points


100%|██████████| 1022/1022 [00:00<00:00, 1073.97it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1058.76it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1264.49it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3966.88it/s]


Cluster 2: 112 data points


100%|██████████| 1022/1022 [00:00<00:00, 1071.15it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1053.33it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1274.18it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3976.10it/s]


Cluster 3: 88 data points


100%|██████████| 1022/1022 [00:00<00:00, 1075.92it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1074.67it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1277.70it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3992.29it/s]


Cluster 4: 230 data points


100%|██████████| 1022/1022 [00:00<00:00, 1077.16it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1066.33it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1277.71it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3964.77it/s]


curated_file is: nor_agon_G_GTP_EC50_curated.csv



100%|██████████| 190/190 [00:00<00:00, 5887.66it/s]


Cluster 0: 26 data points


100%|██████████| 190/190 [00:00<00:00, 3940.12it/s]
100%|██████████| 190/190 [00:00<00:00, 5947.64it/s]
100%|██████████| 190/190 [00:00<00:00, 9902.55it/s]
100%|██████████| 190/190 [00:00<00:00, 21730.96it/s]


Cluster 1: 40 data points


100%|██████████| 190/190 [00:00<00:00, 5904.19it/s]
100%|██████████| 190/190 [00:00<00:00, 5883.18it/s]
100%|██████████| 190/190 [00:00<00:00, 9915.00it/s]
100%|██████████| 190/190 [00:00<00:00, 21723.26it/s]


Cluster 2: 61 data points


100%|██████████| 190/190 [00:00<00:00, 5895.80it/s]
100%|██████████| 190/190 [00:00<00:00, 5902.61it/s]
100%|██████████| 190/190 [00:00<00:00, 9952.39it/s]
100%|██████████| 190/190 [00:00<00:00, 21853.72it/s]


Cluster 3: 43 data points


100%|██████████| 190/190 [00:00<00:00, 5947.32it/s]
100%|██████████| 190/190 [00:00<00:00, 5894.19it/s]
100%|██████████| 190/190 [00:00<00:00, 9899.97it/s]
100%|██████████| 190/190 [00:00<00:00, 21922.25it/s]


Cluster 4: 20 data points


100%|██████████| 190/190 [00:00<00:00, 5916.94it/s]
100%|██████████| 190/190 [00:00<00:00, 5825.46it/s]
100%|██████████| 190/190 [00:00<00:00, 9978.31it/s]
100%|██████████| 190/190 [00:00<00:00, 21877.72it/s]


curated_file is: mor_bind_RBA_IC50_curated.csv



100%|██████████| 561/561 [00:00<00:00, 1983.74it/s]


Cluster 0: 20 data points


100%|██████████| 561/561 [00:00<00:00, 1964.61it/s]
100%|██████████| 561/561 [00:00<00:00, 1938.76it/s]
100%|██████████| 561/561 [00:00<00:00, 3265.88it/s]
100%|██████████| 561/561 [00:00<00:00, 7330.03it/s]


Cluster 1: 77 data points


100%|██████████| 561/561 [00:00<00:00, 1992.01it/s]
100%|██████████| 561/561 [00:00<00:00, 1993.67it/s]
100%|██████████| 561/561 [00:00<00:00, 3305.27it/s]
100%|██████████| 561/561 [00:00<00:00, 7381.79it/s]


Cluster 2: 101 data points


100%|██████████| 561/561 [00:00<00:00, 1990.97it/s]
100%|██████████| 561/561 [00:00<00:00, 1994.34it/s]
100%|██████████| 561/561 [00:00<00:00, 3274.18it/s]
100%|██████████| 561/561 [00:00<00:00, 7366.24it/s]


Cluster 3: 322 data points


100%|██████████| 561/561 [00:00<00:00, 1985.65it/s]
100%|██████████| 561/561 [00:00<00:00, 1973.45it/s]
100%|██████████| 561/561 [00:00<00:00, 3322.13it/s]
100%|██████████| 561/561 [00:00<00:00, 7399.64it/s]


Cluster 4: 41 data points


100%|██████████| 561/561 [00:00<00:00, 1989.47it/s]
100%|██████████| 561/561 [00:00<00:00, 1985.64it/s]
100%|██████████| 561/561 [00:00<00:00, 3289.39it/s]
100%|██████████| 561/561 [00:00<00:00, 7338.37it/s]


curated_file is: kor_antag_G_GTP_IC50_curated.csv



100%|██████████| 133/133 [00:00<00:00, 8371.49it/s]


Cluster 0: 78 data points


100%|██████████| 133/133 [00:00<00:00, 5520.24it/s]
100%|██████████| 133/133 [00:00<00:00, 8096.52it/s]
100%|██████████| 133/133 [00:00<00:00, 13157.59it/s]
100%|██████████| 133/133 [00:00<00:00, 31365.89it/s]


Cluster 1: 19 data points


100%|██████████| 133/133 [00:00<00:00, 8401.37it/s]
100%|██████████| 133/133 [00:00<00:00, 8077.30it/s]
100%|██████████| 133/133 [00:00<00:00, 13378.80it/s]
100%|██████████| 133/133 [00:00<00:00, 31279.71it/s]


Cluster 2: 5 data points


100%|██████████| 133/133 [00:00<00:00, 8299.37it/s]
100%|██████████| 133/133 [00:00<00:00, 8453.05it/s]
100%|██████████| 133/133 [00:00<00:00, 13338.49it/s]
100%|██████████| 133/133 [00:00<00:00, 31255.18it/s]


Cluster 3: 16 data points


100%|██████████| 133/133 [00:00<00:00, 8486.23it/s]
100%|██████████| 133/133 [00:00<00:00, 8248.57it/s]
100%|██████████| 133/133 [00:00<00:00, 13309.85it/s]
100%|██████████| 133/133 [00:00<00:00, 31074.11it/s]


Cluster 4: 15 data points


100%|██████████| 133/133 [00:00<00:00, 8295.92it/s]
100%|██████████| 133/133 [00:00<00:00, 8185.63it/s]
100%|██████████| 133/133 [00:00<00:00, 13338.17it/s]
100%|██████████| 133/133 [00:00<00:00, 31037.80it/s]


curated_file is: nor_agon_G_cAMP_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: nor_bind_RBA_IC50_curated.csv



100%|██████████| 417/417 [00:00<00:00, 2662.17it/s]


Cluster 0: 201 data points


100%|██████████| 417/417 [00:00<00:00, 2355.13it/s]
100%|██████████| 417/417 [00:00<00:00, 2617.98it/s]
100%|██████████| 417/417 [00:00<00:00, 4685.86it/s]
100%|██████████| 417/417 [00:00<00:00, 9882.95it/s]


Cluster 1: 46 data points


100%|██████████| 417/417 [00:00<00:00, 2681.87it/s]
100%|██████████| 417/417 [00:00<00:00, 2647.15it/s]
100%|██████████| 417/417 [00:00<00:00, 4631.12it/s]
100%|██████████| 417/417 [00:00<00:00, 9780.81it/s]


Cluster 2: 33 data points


100%|██████████| 417/417 [00:00<00:00, 2656.26it/s]
100%|██████████| 417/417 [00:00<00:00, 2679.07it/s]
100%|██████████| 417/417 [00:00<00:00, 4664.53it/s]
100%|██████████| 417/417 [00:00<00:00, 9918.99it/s]


Cluster 3: 75 data points


100%|██████████| 417/417 [00:00<00:00, 2665.06it/s]
100%|██████████| 417/417 [00:00<00:00, 2667.68it/s]
100%|██████████| 417/417 [00:00<00:00, 4674.79it/s]
100%|██████████| 417/417 [00:00<00:00, 9939.73it/s]


Cluster 4: 62 data points


100%|██████████| 417/417 [00:00<00:00, 2659.81it/s]
100%|██████████| 417/417 [00:00<00:00, 2624.97it/s]
100%|██████████| 417/417 [00:00<00:00, 4701.88it/s]
100%|██████████| 417/417 [00:00<00:00, 9959.71it/s]


curated_file is: nor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 59/59 [00:00<00:00, 17989.53it/s]


Cluster 0: 27 data points


100%|██████████| 59/59 [00:00<00:00, 11385.50it/s]
100%|██████████| 59/59 [00:00<00:00, 12333.11it/s]
100%|██████████| 59/59 [00:00<00:00, 22356.49it/s]
100%|██████████| 59/59 [00:00<00:00, 38077.23it/s]

Cluster 1: 7 data points



100%|██████████| 59/59 [00:00<00:00, 11736.49it/s]
100%|██████████| 59/59 [00:00<00:00, 18226.70it/s]
100%|██████████| 59/59 [00:00<00:00, 32221.87it/s]
100%|██████████| 59/59 [00:00<00:00, 63144.66it/s]


Cluster 2: 6 data points


100%|██████████| 59/59 [00:00<00:00, 18408.39it/s]
100%|██████████| 59/59 [00:00<00:00, 18500.59it/s]
100%|██████████| 59/59 [00:00<00:00, 32213.48it/s]
100%|██████████| 59/59 [00:00<00:00, 61223.14it/s]


Cluster 3: 7 data points


100%|██████████| 59/59 [00:00<00:00, 18501.98it/s]
100%|██████████| 59/59 [00:00<00:00, 18123.92it/s]
100%|██████████| 59/59 [00:00<00:00, 32179.97it/s]
100%|██████████| 59/59 [00:00<00:00, 65588.11it/s]


Cluster 4: 12 data points


100%|██████████| 59/59 [00:00<00:00, 18098.73it/s]
100%|██████████| 59/59 [00:00<00:00, 18265.72it/s]
100%|██████████| 59/59 [00:00<00:00, 31495.98it/s]
100%|██████████| 59/59 [00:00<00:00, 64043.46it/s]


curated_file is: dor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_bind_RBA_Ki_curated.csv



100%|██████████| 4523/4523 [00:18<00:00, 243.18it/s] 


Cluster 0: 514 data points


100%|██████████| 4523/4523 [00:18<00:00, 242.97it/s] 
[11:31:20] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 244.89it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 350.68it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 900.58it/s] 


Cluster 1: 225 data points


100%|██████████| 4523/4523 [00:18<00:00, 245.03it/s] 
[11:32:21] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 244.67it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 350.87it/s] 
100%|██████████| 4523/4523 [00:04<00:00, 905.01it/s] 


Cluster 2: 1200 data points


100%|██████████| 4523/4523 [00:18<00:00, 244.43it/s] 
[11:33:21] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 242.26it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 350.76it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 904.42it/s] 


Cluster 3: 2437 data points


100%|██████████| 4523/4523 [00:18<00:00, 243.88it/s] 
[11:34:22] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 243.92it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 352.82it/s] 
100%|██████████| 4523/4523 [00:04<00:00, 906.10it/s] 


Cluster 4: 147 data points


100%|██████████| 4523/4523 [00:18<00:00, 244.51it/s] 
[11:35:22] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 244.07it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 353.63it/s] 
100%|██████████| 4523/4523 [00:04<00:00, 907.86it/s] 


curated_file is: dor_agon_G_GTP_EC50_curated.csv



100%|██████████| 581/581 [00:00<00:00, 1919.31it/s]


Cluster 0: 201 data points


100%|██████████| 581/581 [00:00<00:00, 1840.11it/s]
100%|██████████| 581/581 [00:00<00:00, 1861.20it/s]
100%|██████████| 581/581 [00:00<00:00, 2490.86it/s]
100%|██████████| 581/581 [00:00<00:00, 7059.60it/s]


Cluster 1: 114 data points


100%|██████████| 581/581 [00:00<00:00, 1870.14it/s]
100%|██████████| 581/581 [00:00<00:00, 1869.00it/s]
100%|██████████| 581/581 [00:00<00:00, 2489.40it/s]
100%|██████████| 581/581 [00:00<00:00, 7159.43it/s]


Cluster 2: 39 data points


100%|██████████| 581/581 [00:00<00:00, 1914.11it/s]
100%|██████████| 581/581 [00:00<00:00, 1872.64it/s]
100%|██████████| 581/581 [00:00<00:00, 2495.52it/s]
100%|██████████| 581/581 [00:00<00:00, 7040.76it/s]


Cluster 3: 89 data points


100%|██████████| 581/581 [00:00<00:00, 1902.53it/s]
100%|██████████| 581/581 [00:00<00:00, 1898.24it/s]
100%|██████████| 581/581 [00:00<00:00, 2494.29it/s]
100%|██████████| 581/581 [00:00<00:00, 7007.35it/s]


Cluster 4: 138 data points


100%|██████████| 581/581 [00:00<00:00, 1898.01it/s]
100%|██████████| 581/581 [00:00<00:00, 1873.95it/s]
100%|██████████| 581/581 [00:00<00:00, 2490.90it/s]
100%|██████████| 581/581 [00:00<00:00, 7056.89it/s]


curated_file is: mor_agon_G_Ca_EC50_curated.csv



100%|██████████| 144/144 [00:00<00:00, 7722.74it/s]


Cluster 0: 24 data points


100%|██████████| 144/144 [00:00<00:00, 5104.28it/s]
100%|██████████| 144/144 [00:00<00:00, 7585.87it/s]
100%|██████████| 144/144 [00:00<00:00, 12122.75it/s]
100%|██████████| 144/144 [00:00<00:00, 28232.59it/s]


Cluster 1: 25 data points


100%|██████████| 144/144 [00:00<00:00, 7734.41it/s]
100%|██████████| 144/144 [00:00<00:00, 7720.07it/s]
100%|██████████| 144/144 [00:00<00:00, 12090.72it/s]
100%|██████████| 144/144 [00:00<00:00, 27696.60it/s]


Cluster 2: 26 data points


100%|██████████| 144/144 [00:00<00:00, 7685.00it/s]
100%|██████████| 144/144 [00:00<00:00, 7455.25it/s]
100%|██████████| 144/144 [00:00<00:00, 12058.37it/s]
100%|██████████| 144/144 [00:00<00:00, 28713.09it/s]


Cluster 3: 35 data points


100%|██████████| 144/144 [00:00<00:00, 7671.44it/s]
100%|██████████| 144/144 [00:00<00:00, 7555.60it/s]
100%|██████████| 144/144 [00:00<00:00, 11946.75it/s]
100%|██████████| 144/144 [00:00<00:00, 28870.93it/s]


Cluster 4: 34 data points


100%|██████████| 144/144 [00:00<00:00, 7486.95it/s]
100%|██████████| 144/144 [00:00<00:00, 7510.60it/s]
100%|██████████| 144/144 [00:00<00:00, 11831.37it/s]
100%|██████████| 144/144 [00:00<00:00, 28607.01it/s]


curated_file is: kor_antag_G_GTP_Ki_curated.csv



100%|██████████| 53/53 [00:00<00:00, 20351.38it/s]


Cluster 0: 12 data points


100%|██████████| 53/53 [00:00<00:00, 13102.56it/s]
100%|██████████| 53/53 [00:00<00:00, 12791.19it/s]
100%|██████████| 53/53 [00:00<00:00, 23749.80it/s]
100%|██████████| 53/53 [00:00<00:00, 43223.43it/s]


Cluster 1: 3 data points


100%|██████████| 53/53 [00:00<00:00, 13427.04it/s]
100%|██████████| 53/53 [00:00<00:00, 13384.20it/s]
100%|██████████| 53/53 [00:00<00:00, 22995.56it/s]
100%|██████████| 53/53 [00:00<00:00, 40886.17it/s]

Cluster 2: 10 data points



100%|██████████| 53/53 [00:00<00:00, 20615.61it/s]
100%|██████████| 53/53 [00:00<00:00, 20254.95it/s]
100%|██████████| 53/53 [00:00<00:00, 32797.01it/s]
100%|██████████| 53/53 [00:00<00:00, 73244.85it/s]


Cluster 3: 10 data points


100%|██████████| 53/53 [00:00<00:00, 20439.33it/s]
100%|██████████| 53/53 [00:00<00:00, 19974.67it/s]
100%|██████████| 53/53 [00:00<00:00, 32996.60it/s]
100%|██████████| 53/53 [00:00<00:00, 72433.40it/s]


Cluster 4: 18 data points


100%|██████████| 53/53 [00:00<00:00, 19709.03it/s]
100%|██████████| 53/53 [00:00<00:00, 20566.02it/s]
100%|██████████| 53/53 [00:00<00:00, 33168.92it/s]
100%|██████████| 53/53 [00:00<00:00, 72221.61it/s]


curated_file is: mor_antag_G_GTP_IC50_curated.csv



100%|██████████| 268/268 [00:00<00:00, 4242.56it/s]


Cluster 0: 58 data points


100%|██████████| 268/268 [00:00<00:00, 2889.92it/s]
100%|██████████| 268/268 [00:00<00:00, 4130.72it/s]
100%|██████████| 268/268 [00:00<00:00, 6458.37it/s]
100%|██████████| 268/268 [00:00<00:00, 15250.57it/s]


Cluster 1: 50 data points


100%|██████████| 268/268 [00:00<00:00, 4165.36it/s]
100%|██████████| 268/268 [00:00<00:00, 4087.20it/s]
100%|██████████| 268/268 [00:00<00:00, 6380.58it/s]
100%|██████████| 268/268 [00:00<00:00, 15278.97it/s]


Cluster 2: 75 data points


100%|██████████| 268/268 [00:00<00:00, 4192.85it/s]
100%|██████████| 268/268 [00:00<00:00, 4071.45it/s]
100%|██████████| 268/268 [00:00<00:00, 6393.79it/s]
100%|██████████| 268/268 [00:00<00:00, 15474.37it/s]


Cluster 3: 52 data points


100%|██████████| 268/268 [00:00<00:00, 4165.61it/s]
100%|██████████| 268/268 [00:00<00:00, 4095.02it/s]
100%|██████████| 268/268 [00:00<00:00, 6386.74it/s]
100%|██████████| 268/268 [00:00<00:00, 15385.20it/s]


Cluster 4: 33 data points


100%|██████████| 268/268 [00:00<00:00, 4113.49it/s]
100%|██████████| 268/268 [00:00<00:00, 4057.28it/s]
100%|██████████| 268/268 [00:00<00:00, 6242.77it/s]
100%|██████████| 268/268 [00:00<00:00, 14850.62it/s]


curated_file is: nor_bind_RBA_Ki_curated.csv



100%|██████████| 1126/1126 [00:01<00:00, 989.27it/s]


Cluster 0: 85 data points


100%|██████████| 1126/1126 [00:01<00:00, 987.18it/s]
100%|██████████| 1126/1126 [00:01<00:00, 962.10it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1670.21it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3638.36it/s]


Cluster 1: 510 data points


100%|██████████| 1126/1126 [00:01<00:00, 984.71it/s]
100%|██████████| 1126/1126 [00:01<00:00, 961.69it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1676.18it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3642.19it/s]


Cluster 2: 245 data points


100%|██████████| 1126/1126 [00:01<00:00, 983.24it/s]
100%|██████████| 1126/1126 [00:01<00:00, 964.26it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1658.68it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3613.12it/s]


Cluster 3: 145 data points


100%|██████████| 1126/1126 [00:01<00:00, 985.25it/s]
100%|██████████| 1126/1126 [00:01<00:00, 963.52it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1659.11it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3638.63it/s]


Cluster 4: 141 data points


100%|██████████| 1126/1126 [00:01<00:00, 985.87it/s]
100%|██████████| 1126/1126 [00:01<00:00, 964.74it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1701.45it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3653.26it/s]


curated_file is: mor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 418/418 [00:00<00:00, 2700.10it/s]


Cluster 0: 134 data points


100%|██████████| 418/418 [00:00<00:00, 2309.88it/s]
100%|██████████| 418/418 [00:00<00:00, 2636.63it/s]
100%|██████████| 418/418 [00:00<00:00, 4024.02it/s]
100%|██████████| 418/418 [00:00<00:00, 9754.52it/s]


Cluster 1: 57 data points


100%|██████████| 418/418 [00:00<00:00, 2653.76it/s]
100%|██████████| 418/418 [00:00<00:00, 2660.79it/s]
100%|██████████| 418/418 [00:00<00:00, 4017.95it/s]
100%|██████████| 418/418 [00:00<00:00, 9830.82it/s]


Cluster 2: 81 data points


100%|██████████| 418/418 [00:00<00:00, 2673.03it/s]
100%|██████████| 418/418 [00:00<00:00, 2691.65it/s]
100%|██████████| 418/418 [00:00<00:00, 4058.53it/s]
100%|██████████| 418/418 [00:00<00:00, 9834.08it/s]


Cluster 3: 60 data points


100%|██████████| 418/418 [00:00<00:00, 2671.66it/s]
100%|██████████| 418/418 [00:00<00:00, 2683.83it/s]
100%|██████████| 418/418 [00:00<00:00, 4044.44it/s]
100%|██████████| 418/418 [00:00<00:00, 9940.91it/s]


Cluster 4: 86 data points


100%|██████████| 418/418 [00:00<00:00, 2674.07it/s]
100%|██████████| 418/418 [00:00<00:00, 2700.66it/s]
100%|██████████| 418/418 [00:00<00:00, 4000.87it/s]
100%|██████████| 418/418 [00:00<00:00, 9906.70it/s]


curated_file is: kor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_Ki_curated.csv



100%|██████████| 3923/3923 [00:13<00:00, 284.45it/s] 


Cluster 0: 445 data points


100%|██████████| 3923/3923 [00:13<00:00, 282.12it/s] 
[11:37:24] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 283.56it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 383.24it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1039.07it/s]


Cluster 1: 2233 data points


100%|██████████| 3923/3923 [00:13<00:00, 283.09it/s] 
[11:38:11] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 281.67it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 382.41it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1043.12it/s]


Cluster 2: 871 data points


100%|██████████| 3923/3923 [00:13<00:00, 280.64it/s] 
[11:38:57] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 281.00it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 383.33it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1044.82it/s]


Cluster 3: 148 data points


100%|██████████| 3923/3923 [00:13<00:00, 283.46it/s] 
[11:39:44] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 280.74it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 384.38it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1045.69it/s]


Cluster 4: 226 data points


100%|██████████| 3923/3923 [00:13<00:00, 282.13it/s] 
[11:40:31] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:14<00:00, 279.77it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 381.41it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1021.01it/s]


curated_file is: kor_bind_RBA_Ki_curated.csv



100%|██████████| 3759/3759 [00:12<00:00, 296.96it/s] 


Cluster 0: 398 data points


100%|██████████| 3759/3759 [00:12<00:00, 293.36it/s] 
[11:41:37] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:12<00:00, 295.60it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 399.02it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1071.63it/s]


Cluster 1: 1858 data points


100%|██████████| 3759/3759 [00:12<00:00, 295.55it/s] 
[11:42:21] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:12<00:00, 294.58it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 397.17it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1068.74it/s]


Cluster 2: 290 data points


100%|██████████| 3759/3759 [00:12<00:00, 294.51it/s] 
[11:43:04] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:12<00:00, 290.46it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 398.96it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1064.98it/s]


Cluster 3: 898 data points


100%|██████████| 3759/3759 [00:12<00:00, 293.16it/s] 
[11:43:47] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:12<00:00, 292.76it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 398.66it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1067.99it/s]


Cluster 4: 315 data points


100%|██████████| 3759/3759 [00:12<00:00, 295.39it/s] 
[11:44:30] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:12<00:00, 295.40it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 399.48it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1066.04it/s]


curated_file is: nor_antag_G_GTP_Ki_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_IC50_curated.csv



100%|██████████| 672/672 [00:00<00:00, 1645.12it/s]


Cluster 0: 297 data points


100%|██████████| 672/672 [00:00<00:00, 1549.35it/s]
100%|██████████| 672/672 [00:00<00:00, 1597.33it/s]
100%|██████████| 672/672 [00:00<00:00, 2459.06it/s]
100%|██████████| 672/672 [00:00<00:00, 5958.67it/s]


Cluster 1: 143 data points


100%|██████████| 672/672 [00:00<00:00, 1636.34it/s]
100%|██████████| 672/672 [00:00<00:00, 1601.14it/s]
100%|██████████| 672/672 [00:00<00:00, 2472.50it/s]
100%|██████████| 672/672 [00:00<00:00, 6013.57it/s]


Cluster 2: 87 data points


100%|██████████| 672/672 [00:00<00:00, 1647.12it/s]
100%|██████████| 672/672 [00:00<00:00, 1633.02it/s]
100%|██████████| 672/672 [00:00<00:00, 2454.62it/s]
100%|██████████| 672/672 [00:00<00:00, 5928.23it/s]


Cluster 3: 83 data points


100%|██████████| 672/672 [00:00<00:00, 1644.02it/s]
100%|██████████| 672/672 [00:00<00:00, 1603.21it/s]
100%|██████████| 672/672 [00:00<00:00, 2468.00it/s]
100%|██████████| 672/672 [00:00<00:00, 6041.26it/s]


Cluster 4: 62 data points


100%|██████████| 672/672 [00:00<00:00, 1648.86it/s]
100%|██████████| 672/672 [00:00<00:00, 1641.71it/s]
100%|██████████| 672/672 [00:00<00:00, 2479.21it/s]
100%|██████████| 672/672 [00:00<00:00, 5981.33it/s]


Done!

use_clustering is: False

curated_file is: nor_antag_G_GTP_IC50_curated.csv



UnboundLocalError: local variable 'cliff_mols' referenced before assignment