In [1]:
# conda env: pyg (Python3.9.16)
# --> 
# 1. improve it by adding multiprocessing feature
# 2. add scaffold split for classification task. Because now three spliting methods:
#    2.1 cluster split + stratified by cliff

import os
import sys
from typing import List, Union
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering

from datacat4ml.utils import mkdirs, get_df_name
from datacat4ml.const import *
from datacat4ml.Scripts.data_prep.data_split.split_utils.cliff import ActivityCliffs, get_tanimoto_matrix

# Split the categorized datasets

In [2]:
def data_spliter(smiles: List[str], pStandard_value: Union[List[float], np.array], activity: List[float],
                 similarity: float=0.9, potency_fold: int=1, 
                 use_clustering: bool=True, n_clusters: int=5, test_size: float=0.2, task: str='cls') -> pd.DataFrame:
    """
    Split the data into train and test sets according to activity cliffs and compound charateristics.

    :param smiles: List of SMILES strings
    :param pStandard_value: List of pStandard_values or np.array of pStandard_values
    :param activity: List of active/inactive labels, where 1 is active and 0 is inactive
    :param similarity: Threshold value to determine structural similarity
    :param potency_fold: Threshold value to determine potency difference, where the potency here is the pStandard_value
    :param use_clustering: Whether to use clustering to split the data
    :param n_clusters: Number of clusters to use if clustering is used
    :param test_size: Test set size

    :return: A dataframe 
    """

    if len(smiles) < 50:
        print(f"The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.")
        return None

    if use_clustering:
        # # cluster the dabaset into 5 clusters based on tanimoto distance matrix
        spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=RANDOM_SEED)
        clusters = spectral.fit(get_tanimoto_matrix(smiles)).labels_ # get the cluster labels for each molecule

        train_idx, test_idx = [], []
        for cluster in range(n_clusters):
            num_in_cluster = len(np.where(clusters == cluster)[0])
            print(f"Cluster {cluster}: {num_in_cluster} data points")

            # get the indices of molecules in the current cluster
            cluster_idx = np.where(clusters == cluster)[0] # `[0]` is need to convert the tuple to a list

            if task == 'reg':
                # get activity cliffs
                cliffs = ActivityCliffs(smiles, pStandard_value)
                cliff_mols = cliffs.get_cliff_molecules(return_smiles=False, similarity=similarity, potency_fold=potency_fold)
                clust_cliff_mols = [cliff_mols[i] for i in cluster_idx]
            
                # can only split data stratified by cliff molecules if there are more than 1 cliff molecules in the cluster, else just split randomly
                if sum(clust_cliff_mols) > 2:
                    try: # try to avoid the Value ERROR when the number of class in the training set is less than 2
                        clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                            stratify=clust_cliff_mols, # ensure that the train/test split has the same proportion of cliff molecules
                                                                            random_state=RANDOM_SEED, shuffle=True)
                    except ValueError:
                        # This block will be executed if an error occurs in the try block
                        print("An error occurred while stratifying data based on cliff molecules. Skipping stratification.")
                        clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                            random_state=RANDOM_SEED, shuffle=True)
                else:
                    clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                        random_state=RANDOM_SEED, shuffle=True)

            elif task == 'cls':
                print("Classification task")
                clust_train_idx, clust_test_idx = train_test_split(cluster_idx, test_size=test_size, 
                                                                    random_state=RANDOM_SEED, shuffle=True)
                print(f'done splitting in task {task}')

            train_idx.extend(clust_train_idx)
            test_idx.extend(clust_test_idx)
            
    else:
        # don't use clustering before spliting. This is to avoid potential cheating by using clustering to split the data, 
        # which make the structual diversity of the train and test sets the same
        if task == 'reg':
            if sum(cliff_mols) > 2:
                train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                stratify=cliff_mols, # ensure that the train/test split has the same proportion of cliff molecules
                                                random_state=RANDOM_SEED, shuffle=True)
            else:
                train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                random_state=RANDOM_SEED, shuffle=True)
        elif task == 'cls':
            train_idx, test_idx = train_test_split(range(len(smiles)), test_size=test_size, 
                                                random_state=RANDOM_SEED, shuffle=True)
            
    train_test = []
    for i in range(len(smiles)):
        if i in train_idx:
            train_test.append('train')
        elif i in test_idx:
            train_test.append('test')
        else:
            raise ValueError('Index not in train or test set')
    
    active_ratio = sum(activity) / len(activity)
    active_ratio_train = sum([activity[i] for i in train_idx]) / len(train_idx)
    active_ratio_test = sum([activity[i] for i in test_idx]) / len(test_idx)
    
    if task == 'reg':
        return pd.DataFrame({'active_ratio': active_ratio,
                            'active_ratio_train': active_ratio_train,
                            'active_ratio_test': active_ratio_test,
                            'cliff_mol': cliff_mols,
                            'split': train_test})
    elif task == 'cls':
        return pd.DataFrame({'active_ratio': active_ratio,
                            'active_ratio_train': active_ratio_train,
                            'active_ratio_test': active_ratio_test,
                            'split': train_test})

In [3]:
def split_data(input_filepath = CURA_CAT_DATASETS_DIR, output_filepath = SPLIT_CAT_DATASETS_DIR,
               task:str = 'cls', use_clustering: bool=True):
    
    # access the final csv files obtained from data curation
    folder_path = os.path.join(input_filepath, task)
    files = os.listdir(folder_path)
    curated_files = [file for file in files if file.endswith('curated.csv')]

    # make new directory to store the featurized data
    output_path = os.path.join(output_filepath, task, 'use_clustering'+'_'+str(use_clustering))
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    mkdirs(output_path)

    for curated_file in curated_files:
        print (f"curated_file is: {curated_file}\n")

        df = pd.read_csv(os.path.join(folder_path, curated_file))
        df = df.drop(columns=['Unnamed: 0'])

        # split the data into train and test sets
        data_splited_df = data_spliter(df['canonical_smiles_by_Std'].tolist(), df['pStandard_value'].tolist(), df['activity'].tolist(), 
                                       use_clustering=use_clustering, task=task)
        
        # save data_splited_df as a csv file if it is not None
        if data_splited_df is not None:
            concat_df = pd.concat([df, data_splited_df], axis=1)
            concat_df.to_csv(os.path.join(output_path, f'{curated_file[:-11]}split.csv'), index=False)

In [None]:
use_clusterings = [True, False]

for task in Tasks:
    print (f"task is: {task}\n")
    
    for use_clustering in use_clusterings:
        print (f"use_clustering is: {use_clustering}\n")
        
        split_data(input_filepath = CURA_CAT_DATASETS_DIR, output_filepath = SPLIT_CAT_DATASETS_DIR, 
                   task=task, use_clustering=use_clustering)
        print(f'Done!\n====================================\n')
#It took around 17 mins to run the above code locally. 
# It's okay to encounter error for 'reg + use_clustering=False' because no calulcation for identifying cliff mols without using clustering.

task is: cls

use_clustering is: True

curated_file is: nor_antag_G_GTP_IC50_curated.csv



100%|██████████| 150/150 [00:00<00:00, 7141.67it/s]


Cluster 0: 32 data points
Classification task
done splitting in task cls
Cluster 1: 26 data points
Classification task
done splitting in task cls
Cluster 2: 44 data points
Classification task
done splitting in task cls
Cluster 3: 40 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_B_arrest_IC50_curated.csv



100%|██████████| 53/53 [00:00<00:00, 13094.08it/s]

Cluster 0: 18 data points
Classification task
done splitting in task cls
Cluster 1: 12 data points
Classification task
done splitting in task cls
Cluster 2: 6 data points
Classification task
done splitting in task cls
Cluster 3: 7 data points
Classification task
done splitting in task cls
Cluster 4: 10 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: kor_agon_G_cAMP_IC50_curated.csv

The number of data po


100%|██████████| 73/73 [00:00<00:00, 10089.11it/s]


Cluster 0: 19 data points
Classification task
done splitting in task cls
Cluster 1: 28 data points
Classification task
done splitting in task cls
Cluster 2: 10 data points
Classification task
done splitting in task cls
Cluster 3: 11 data points
Classification task
done splitting in task cls
Cluster 4: 5 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 55/55 [00:00<00:00, 18930.47it/s]


Cluster 0: 18 data points
Classification task
done splitting in task cls
Cluster 1: 18 data points
Classification task
done splitting in task cls
Cluster 2: 5 data points
Classification task
done splitting in task cls
Cluster 3: 6 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_GTP_EC50_curated.csv



100%|██████████| 980/980 [00:00<00:00, 1124.41it/s]


Cluster 0: 295 data points
Classification task
done splitting in task cls
Cluster 1: 242 data points
Classification task
done splitting in task cls
Cluster 2: 145 data points
Classification task
done splitting in task cls
Cluster 3: 222 data points
Classification task
done splitting in task cls
Cluster 4: 76 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 125/125 [00:00<00:00, 8740.17it/s]


Cluster 0: 28 data points
Classification task
done splitting in task cls
Cluster 1: 24 data points
Classification task
done splitting in task cls
Cluster 2: 18 data points
Classification task
done splitting in task cls
Cluster 3: 44 data points
Classification task
done splitting in task cls
Cluster 4: 11 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_G_GTP_Ki_curated.csv



100%|██████████| 61/61 [00:00<00:00, 11314.90it/s]


Cluster 0: 10 data points
Classification task
done splitting in task cls
Cluster 1: 27 data points
Classification task
done splitting in task cls
Cluster 2: 6 data points
Classification task
done splitting in task cls
Cluster 3: 10 data points
Classification task
done splitting in task cls
Cluster 4: 8 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_Ca_EC50_curated.csv



100%|██████████| 74/74 [00:00<00:00, 6336.45it/s]

Cluster 0: 5 data points
Classification task
done splitting in task cls
Cluster 1: 16 data points
Classification task
done splitting in task cls
Cluster 2: 45 data points
Classification task
done splitting in task cls
Cluster 3: 2 data points
Classification task
done splitting in task cls
Cluster 4: 6 data points
Classification task
done splitting in task cls
curated_file is: dor_antag_G_GTP_IC50_curated.csv




100%|██████████| 169/169 [00:00<00:00, 6524.76it/s]


Cluster 0: 54 data points
Classification task
done splitting in task cls
Cluster 1: 35 data points
Classification task
done splitting in task cls
Cluster 2: 18 data points
Classification task
done splitting in task cls
Cluster 3: 15 data points
Classification task
done splitting in task cls
Cluster 4: 47 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_B_arrest_EC50_curated.csv



100%|██████████| 207/207 [00:00<00:00, 3623.87it/s]


Cluster 0: 47 data points
Classification task
done splitting in task cls
Cluster 1: 48 data points
Classification task
done splitting in task cls
Cluster 2: 45 data points
Classification task
done splitting in task cls
Cluster 3: 35 data points
Classification task
done splitting in task cls
Cluster 4: 32 data points
Classification task
done splitting in task cls
curated_file is: kor_bind_RBA_IC50_curated.csv



100%|██████████| 416/416 [00:00<00:00, 2559.63it/s]


Cluster 0: 53 data points
Classification task
done splitting in task cls
Cluster 1: 224 data points
Classification task
done splitting in task cls
Cluster 2: 70 data points
Classification task
done splitting in task cls
Cluster 3: 20 data points
Classification task
done splitting in task cls
Cluster 4: 49 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 253/253 [00:00<00:00, 3481.24it/s]


Cluster 0: 64 data points
Classification task
done splitting in task cls
Cluster 1: 39 data points
Classification task
done splitting in task cls
Cluster 2: 37 data points
Classification task
done splitting in task cls
Cluster 3: 65 data points
Classification task
done splitting in task cls
Cluster 4: 48 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_G_GTP_EC50_curated.csv



100%|██████████| 1246/1246 [00:01<00:00, 882.58it/s] 


Cluster 0: 124 data points
Classification task
done splitting in task cls
Cluster 1: 212 data points
Classification task
done splitting in task cls
Cluster 2: 411 data points
Classification task
done splitting in task cls
Cluster 3: 311 data points
Classification task
done splitting in task cls
Cluster 4: 188 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_GTP_EC50_curated.csv



100%|██████████| 222/222 [00:00<00:00, 5129.32it/s]


Cluster 0: 42 data points
Classification task
done splitting in task cls
Cluster 1: 66 data points
Classification task
done splitting in task cls
Cluster 2: 20 data points
Classification task
done splitting in task cls
Cluster 3: 63 data points
Classification task
done splitting in task cls
Cluster 4: 31 data points
Classification task
done splitting in task cls
curated_file is: mor_bind_RBA_IC50_curated.csv



100%|██████████| 582/582 [00:00<00:00, 1918.08it/s]


Cluster 0: 77 data points
Classification task
done splitting in task cls
Cluster 1: 100 data points
Classification task
done splitting in task cls
Cluster 2: 54 data points
Classification task
done splitting in task cls
Cluster 3: 20 data points
Classification task
done splitting in task cls
Cluster 4: 331 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_G_GTP_IC50_curated.csv



100%|██████████| 185/185 [00:00<00:00, 4034.54it/s]


Cluster 0: 48 data points
Classification task
done splitting in task cls
Cluster 1: 82 data points
Classification task
done splitting in task cls
Cluster 2: 23 data points
Classification task
done splitting in task cls
Cluster 3: 16 data points
Classification task
done splitting in task cls
Cluster 4: 16 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_cAMP_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: nor_bind_RBA_IC50_curated.csv



100%|██████████| 417/417 [00:00<00:00, 2696.11it/s]


Cluster 0: 201 data points
Classification task
done splitting in task cls
Cluster 1: 46 data points
Classification task
done splitting in task cls
Cluster 2: 33 data points
Classification task
done splitting in task cls
Cluster 3: 75 data points
Classification task
done splitting in task cls
Cluster 4: 62 data points
Classification task
done splitting in task cls
curated_file is: nor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 69/69 [00:00<00:00, 10112.41it/s]


Cluster 0: 16 data points
Classification task
done splitting in task cls
Cluster 1: 8 data points
Classification task
done splitting in task cls
Cluster 2: 4 data points
Classification task
done splitting in task cls
Cluster 3: 5 data points
Classification task
done splitting in task cls
Cluster 4: 36 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_bind_RBA_Ki_curated.csv



100%|██████████| 4654/4654 [00:20<00:00, 232.47it/s] 


Cluster 0: 2534 data points
Classification task
done splitting in task cls
Cluster 1: 1201 data points
Classification task
done splitting in task cls
Cluster 2: 149 data points
Classification task
done splitting in task cls
Cluster 3: 545 data points
Classification task
done splitting in task cls
Cluster 4: 225 data points
Classification task
done splitting in task cls
curated_file is: dor_agon_G_GTP_EC50_curated.csv



100%|██████████| 648/648 [00:00<00:00, 1677.49it/s]


Cluster 0: 115 data points
Classification task
done splitting in task cls
Cluster 1: 110 data points
Classification task
done splitting in task cls
Cluster 2: 224 data points
Classification task
done splitting in task cls
Cluster 3: 46 data points
Classification task
done splitting in task cls
Cluster 4: 153 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_Ca_EC50_curated.csv



100%|██████████| 144/144 [00:00<00:00, 7326.56it/s]


Cluster 0: 24 data points
Classification task
done splitting in task cls
Cluster 1: 25 data points
Classification task
done splitting in task cls
Cluster 2: 26 data points
Classification task
done splitting in task cls
Cluster 3: 35 data points
Classification task
done splitting in task cls
Cluster 4: 34 data points
Classification task
done splitting in task cls
curated_file is: kor_antag_G_GTP_Ki_curated.csv



100%|██████████| 53/53 [00:00<00:00, 12668.00it/s]


Cluster 0: 12 data points
Classification task
done splitting in task cls
Cluster 1: 3 data points
Classification task
done splitting in task cls
Cluster 2: 10 data points
Classification task
done splitting in task cls
Cluster 3: 10 data points
Classification task
done splitting in task cls
Cluster 4: 18 data points
Classification task
done splitting in task cls
curated_file is: mor_antag_G_GTP_IC50_curated.csv



100%|██████████| 339/339 [00:00<00:00, 2690.94it/s]


Cluster 0: 71 data points
Classification task
done splitting in task cls
Cluster 1: 113 data points
Classification task
done splitting in task cls
Cluster 2: 56 data points
Classification task
done splitting in task cls
Cluster 3: 17 data points
Classification task
done splitting in task cls
Cluster 4: 82 data points
Classification task
done splitting in task cls
curated_file is: nor_bind_RBA_Ki_curated.csv



100%|██████████| 1142/1142 [00:01<00:00, 963.29it/s]


Cluster 0: 527 data points
Classification task
done splitting in task cls
Cluster 1: 245 data points
Classification task
done splitting in task cls
Cluster 2: 85 data points
Classification task
done splitting in task cls
Cluster 3: 141 data points
Classification task
done splitting in task cls
Cluster 4: 144 data points
Classification task
done splitting in task cls
curated_file is: mor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 435/435 [00:00<00:00, 2520.20it/s]


Cluster 0: 110 data points
Classification task
done splitting in task cls
Cluster 1: 64 data points
Classification task
done splitting in task cls
Cluster 2: 57 data points
Classification task
done splitting in task cls
Cluster 3: 78 data points
Classification task
done splitting in task cls
Cluster 4: 126 data points
Classification task
done splitting in task cls
curated_file is: kor_agon_B_arrest_EC50_curated.csv



100%|██████████| 50/50 [00:00<00:00, 14032.47it/s]

Cluster 0: 4 data points
Classification task
done splitting in task cls
Cluster 1: 13 data points
Classification task
done splitting in task cls
Cluster 2: 9 data points
Classification task
done splitting in task cls
Cluster 3: 12 data points
Classification task
done splitting in task cls
Cluster 4: 12 data points
Classification task
done splitting in task cls
curated_file is: dor_bind_RBA_Ki_curated.csv




100%|██████████| 4035/4035 [00:14<00:00, 270.25it/s] 


Cluster 0: 226 data points
Classification task
done splitting in task cls
Cluster 1: 875 data points
Classification task
done splitting in task cls
Cluster 2: 2335 data points
Classification task
done splitting in task cls
Cluster 3: 150 data points
Classification task
done splitting in task cls
Cluster 4: 449 data points
Classification task
done splitting in task cls
curated_file is: kor_bind_RBA_Ki_curated.csv



100%|██████████| 3860/3860 [00:13<00:00, 284.11it/s] 


Cluster 0: 301 data points
Classification task
done splitting in task cls
Cluster 1: 1936 data points
Classification task
done splitting in task cls
Cluster 2: 318 data points
Classification task
done splitting in task cls
Cluster 3: 401 data points
Classification task
done splitting in task cls
Cluster 4: 904 data points
Classification task
done splitting in task cls
curated_file is: nor_antag_G_GTP_Ki_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_IC50_curated.csv



100%|██████████| 693/693 [00:00<00:00, 1553.02it/s]


Cluster 0: 82 data points
Classification task
done splitting in task cls
Cluster 1: 315 data points
Classification task
done splitting in task cls
Cluster 2: 154 data points
Classification task
done splitting in task cls
Cluster 3: 88 data points
Classification task
done splitting in task cls
Cluster 4: 54 data points
Classification task
done splitting in task cls
Done!

use_clustering is: False

curated_file is: nor_antag_G_GTP_IC50_curated.csv

curated_file is: kor_antag_B_arrest_IC50_curated.csv

curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is n

100%|██████████| 120/120 [00:00<00:00, 9117.23it/s]


Cluster 0: 22 data points


100%|██████████| 120/120 [00:00<00:00, 5912.33it/s]
100%|██████████| 120/120 [00:00<00:00, 9161.20it/s]
100%|██████████| 120/120 [00:00<00:00, 16283.82it/s]
100%|██████████| 120/120 [00:00<00:00, 33117.28it/s]


Cluster 1: 44 data points


100%|██████████| 120/120 [00:00<00:00, 9237.03it/s]
100%|██████████| 120/120 [00:00<00:00, 9076.12it/s]
100%|██████████| 120/120 [00:00<00:00, 16358.97it/s]
100%|██████████| 120/120 [00:00<00:00, 33465.19it/s]


Cluster 2: 24 data points


100%|██████████| 120/120 [00:00<00:00, 9229.24it/s]
100%|██████████| 120/120 [00:00<00:00, 9072.36it/s]
100%|██████████| 120/120 [00:00<00:00, 16500.02it/s]
100%|██████████| 120/120 [00:00<00:00, 33093.33it/s]


Cluster 3: 8 data points


100%|██████████| 120/120 [00:00<00:00, 9082.84it/s]
100%|██████████| 120/120 [00:00<00:00, 9191.82it/s]
100%|██████████| 120/120 [00:00<00:00, 16188.49it/s]
100%|██████████| 120/120 [00:00<00:00, 33112.93it/s]


Cluster 4: 22 data points


100%|██████████| 120/120 [00:00<00:00, 9216.23it/s]
100%|██████████| 120/120 [00:00<00:00, 8950.08it/s]
100%|██████████| 120/120 [00:00<00:00, 16263.30it/s]
100%|██████████| 120/120 [00:00<00:00, 33123.82it/s]


curated_file is: kor_antag_B_arrest_IC50_curated.csv



100%|██████████| 52/52 [00:00<00:00, 20638.13it/s]


Cluster 0: 18 data points


100%|██████████| 52/52 [00:00<00:00, 13772.66it/s]
100%|██████████| 52/52 [00:00<00:00, 13716.36it/s]
100%|██████████| 52/52 [00:00<00:00, 25611.06it/s]
100%|██████████| 52/52 [00:00<00:00, 55146.35it/s]


Cluster 1: 6 data points


100%|██████████| 52/52 [00:00<00:00, 13418.47it/s]
100%|██████████| 52/52 [00:00<00:00, 13432.52it/s]
100%|██████████| 52/52 [00:00<00:00, 26440.03it/s]
100%|██████████| 52/52 [00:00<00:00, 56010.22it/s]

An error occurred while stratifying data based on cliff molecules. Skipping stratification.
Cluster 2: 12 data points



100%|██████████| 52/52 [00:00<00:00, 20708.68it/s]
100%|██████████| 52/52 [00:00<00:00, 20750.05it/s]
100%|██████████| 52/52 [00:00<00:00, 36068.10it/s]
100%|██████████| 52/52 [00:00<00:00, 72871.30it/s]


Cluster 3: 10 data points


100%|██████████| 52/52 [00:00<00:00, 20905.19it/s]
100%|██████████| 52/52 [00:00<00:00, 20589.43it/s]
100%|██████████| 52/52 [00:00<00:00, 35389.23it/s]
100%|██████████| 52/52 [00:00<00:00, 73460.36it/s]


Cluster 4: 6 data points


100%|██████████| 52/52 [00:00<00:00, 20919.22it/s]
100%|██████████| 52/52 [00:00<00:00, 20698.85it/s]
100%|██████████| 52/52 [00:00<00:00, 35510.23it/s]
100%|██████████| 52/52 [00:00<00:00, 72387.59it/s]


An error occurred while stratifying data based on cliff molecules. Skipping stratification.
curated_file is: nor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: kor_agon_G_cAMP_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_antag_G_GTP_Ki_curated.csv



100%|██████████| 71/71 [00:00<00:00, 15085.13it/s]


Cluster 0: 5 data points


100%|██████████| 71/71 [00:00<00:00, 9862.09it/s]
100%|██████████| 71/71 [00:00<00:00, 10007.92it/s]
100%|██████████| 71/71 [00:00<00:00, 18531.15it/s]
100%|██████████| 71/71 [00:00<00:00, 43550.10it/s]


An error occurred while stratifying data based on cliff molecules. Skipping stratification.
Cluster 1: 26 data points


100%|██████████| 71/71 [00:00<00:00, 10096.82it/s]
100%|██████████| 71/71 [00:00<00:00, 15647.92it/s]
100%|██████████| 71/71 [00:00<00:00, 24874.34it/s]
100%|██████████| 71/71 [00:00<00:00, 54822.46it/s]


Cluster 2: 11 data points


100%|██████████| 71/71 [00:00<00:00, 15734.73it/s]
100%|██████████| 71/71 [00:00<00:00, 15168.89it/s]
100%|██████████| 71/71 [00:00<00:00, 24635.64it/s]
100%|██████████| 71/71 [00:00<00:00, 56166.65it/s]


Cluster 3: 10 data points


100%|██████████| 71/71 [00:00<00:00, 15550.68it/s]
100%|██████████| 71/71 [00:00<00:00, 15266.09it/s]
100%|██████████| 71/71 [00:00<00:00, 24901.38it/s]
100%|██████████| 71/71 [00:00<00:00, 56411.36it/s]


Cluster 4: 19 data points


100%|██████████| 71/71 [00:00<00:00, 15178.94it/s]
100%|██████████| 71/71 [00:00<00:00, 14782.61it/s]
100%|██████████| 71/71 [00:00<00:00, 24841.14it/s]
100%|██████████| 71/71 [00:00<00:00, 53860.66it/s]


curated_file is: mor_antag_B_arrest_IC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 53/53 [00:00<00:00, 19791.50it/s]


Cluster 0: 17 data points


100%|██████████| 53/53 [00:00<00:00, 13340.02it/s]
100%|██████████| 53/53 [00:00<00:00, 13084.83it/s]
100%|██████████| 53/53 [00:00<00:00, 22856.07it/s]
100%|██████████| 53/53 [00:00<00:00, 54205.83it/s]


Cluster 1: 5 data points


100%|██████████| 53/53 [00:00<00:00, 13063.30it/s]
100%|██████████| 53/53 [00:00<00:00, 13043.37it/s]
100%|██████████| 53/53 [00:00<00:00, 22357.25it/s]
100%|██████████| 53/53 [00:00<00:00, 54685.88it/s]


Cluster 2: 17 data points


100%|██████████| 53/53 [00:00<00:00, 20030.47it/s]
100%|██████████| 53/53 [00:00<00:00, 20016.04it/s]
100%|██████████| 53/53 [00:00<00:00, 29369.55it/s]
100%|██████████| 53/53 [00:00<00:00, 71547.51it/s]


Cluster 3: 6 data points


100%|██████████| 53/53 [00:00<00:00, 20627.09it/s]
100%|██████████| 53/53 [00:00<00:00, 20258.65it/s]
100%|██████████| 53/53 [00:00<00:00, 31318.42it/s]
100%|██████████| 53/53 [00:00<00:00, 72457.01it/s]


Cluster 4: 8 data points


100%|██████████| 53/53 [00:00<00:00, 20556.51it/s]
100%|██████████| 53/53 [00:00<00:00, 20218.11it/s]
100%|██████████| 53/53 [00:00<00:00, 31212.88it/s]
100%|██████████| 53/53 [00:00<00:00, 72034.38it/s]


curated_file is: mor_agon_G_GTP_EC50_curated.csv



100%|██████████| 869/869 [00:00<00:00, 1265.26it/s]


Cluster 0: 228 data points


100%|██████████| 869/869 [00:00<00:00, 1264.54it/s]
100%|██████████| 869/869 [00:00<00:00, 1268.32it/s]
100%|██████████| 869/869 [00:00<00:00, 1580.13it/s]
100%|██████████| 869/869 [00:00<00:00, 4727.05it/s]


Cluster 1: 238 data points


100%|██████████| 869/869 [00:00<00:00, 1266.14it/s]
100%|██████████| 869/869 [00:00<00:00, 1263.70it/s]
100%|██████████| 869/869 [00:00<00:00, 1579.27it/s]
100%|██████████| 869/869 [00:00<00:00, 4685.66it/s]


Cluster 2: 55 data points


100%|██████████| 869/869 [00:00<00:00, 1266.09it/s]
100%|██████████| 869/869 [00:00<00:00, 1261.70it/s]
100%|██████████| 869/869 [00:00<00:00, 1583.91it/s]
100%|██████████| 869/869 [00:00<00:00, 4693.12it/s]


Cluster 3: 91 data points


100%|██████████| 869/869 [00:00<00:00, 1264.22it/s]
100%|██████████| 869/869 [00:00<00:00, 1265.03it/s]
100%|██████████| 869/869 [00:00<00:00, 1579.22it/s]
100%|██████████| 869/869 [00:00<00:00, 4752.93it/s]


Cluster 4: 257 data points


100%|██████████| 869/869 [00:00<00:00, 1271.81it/s]
100%|██████████| 869/869 [00:00<00:00, 1263.60it/s]
100%|██████████| 869/869 [00:00<00:00, 1576.84it/s]
100%|██████████| 869/869 [00:00<00:00, 4715.71it/s]


curated_file is: dor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 113/113 [00:00<00:00, 9595.04it/s]


Cluster 0: 10 data points


100%|██████████| 113/113 [00:00<00:00, 6418.17it/s]
100%|██████████| 113/113 [00:00<00:00, 9848.44it/s]
100%|██████████| 113/113 [00:00<00:00, 14344.49it/s]
100%|██████████| 113/113 [00:00<00:00, 35727.15it/s]


Cluster 1: 40 data points


100%|██████████| 113/113 [00:00<00:00, 9687.41it/s]
100%|██████████| 113/113 [00:00<00:00, 9699.90it/s]
100%|██████████| 113/113 [00:00<00:00, 14284.40it/s]
100%|██████████| 113/113 [00:00<00:00, 36301.80it/s]


Cluster 2: 28 data points


100%|██████████| 113/113 [00:00<00:00, 9724.77it/s]
100%|██████████| 113/113 [00:00<00:00, 9502.12it/s]
100%|██████████| 113/113 [00:00<00:00, 14207.75it/s]
100%|██████████| 113/113 [00:00<00:00, 36083.47it/s]


Cluster 3: 23 data points


100%|██████████| 113/113 [00:00<00:00, 9525.61it/s]
100%|██████████| 113/113 [00:00<00:00, 9500.03it/s]
100%|██████████| 113/113 [00:00<00:00, 14200.51it/s]
100%|██████████| 113/113 [00:00<00:00, 36066.99it/s]


Cluster 4: 12 data points


100%|██████████| 113/113 [00:00<00:00, 9693.75it/s]
100%|██████████| 113/113 [00:00<00:00, 9735.36it/s]
100%|██████████| 113/113 [00:00<00:00, 14281.82it/s]
100%|██████████| 113/113 [00:00<00:00, 36257.37it/s]


curated_file is: mor_antag_G_GTP_Ki_curated.csv



100%|██████████| 61/61 [00:00<00:00, 17608.57it/s]


Cluster 0: 10 data points


100%|██████████| 61/61 [00:00<00:00, 11440.37it/s]
100%|██████████| 61/61 [00:00<00:00, 11055.29it/s]
100%|██████████| 61/61 [00:00<00:00, 21782.10it/s]
100%|██████████| 61/61 [00:00<00:00, 49070.30it/s]


Cluster 1: 27 data points


100%|██████████| 61/61 [00:00<00:00, 11599.08it/s]
100%|██████████| 61/61 [00:00<00:00, 17680.36it/s]
100%|██████████| 61/61 [00:00<00:00, 28955.70it/s]
100%|██████████| 61/61 [00:00<00:00, 63486.98it/s]


Cluster 2: 6 data points


100%|██████████| 61/61 [00:00<00:00, 17814.55it/s]
100%|██████████| 61/61 [00:00<00:00, 17752.74it/s]
100%|██████████| 61/61 [00:00<00:00, 29037.86it/s]
100%|██████████| 61/61 [00:00<00:00, 64187.79it/s]


Cluster 3: 10 data points


100%|██████████| 61/61 [00:00<00:00, 17773.71it/s]
100%|██████████| 61/61 [00:00<00:00, 17690.14it/s]
100%|██████████| 61/61 [00:00<00:00, 29047.75it/s]
100%|██████████| 61/61 [00:00<00:00, 64365.42it/s]


Cluster 4: 8 data points


100%|██████████| 61/61 [00:00<00:00, 17787.30it/s]
100%|██████████| 61/61 [00:00<00:00, 17549.39it/s]
100%|██████████| 61/61 [00:00<00:00, 29143.70it/s]
100%|██████████| 61/61 [00:00<00:00, 64043.19it/s]


curated_file is: kor_agon_G_Ca_EC50_curated.csv



100%|██████████| 74/74 [00:00<00:00, 14820.15it/s]


Cluster 0: 5 data points


100%|██████████| 74/74 [00:00<00:00, 9529.00it/s]
100%|██████████| 74/74 [00:00<00:00, 9487.35it/s]
100%|██████████| 74/74 [00:00<00:00, 16405.65it/s]
100%|██████████| 74/74 [00:00<00:00, 41723.15it/s]


Cluster 1: 16 data points


100%|██████████| 74/74 [00:00<00:00, 9699.93it/s]
100%|██████████| 74/74 [00:00<00:00, 15116.08it/s]
100%|██████████| 74/74 [00:00<00:00, 21843.80it/s]
100%|██████████| 74/74 [00:00<00:00, 53302.16it/s]


Cluster 2: 45 data points


100%|██████████| 74/74 [00:00<00:00, 15144.11it/s]
100%|██████████| 74/74 [00:00<00:00, 15003.55it/s]
100%|██████████| 74/74 [00:00<00:00, 21680.53it/s]
100%|██████████| 74/74 [00:00<00:00, 53960.10it/s]


Cluster 3: 2 data points


100%|██████████| 74/74 [00:00<00:00, 15014.44it/s]
100%|██████████| 74/74 [00:00<00:00, 14971.71it/s]
100%|██████████| 74/74 [00:00<00:00, 21679.02it/s]
100%|██████████| 74/74 [00:00<00:00, 53903.87it/s]


Cluster 4: 6 data points


100%|██████████| 74/74 [00:00<00:00, 13994.25it/s]
100%|██████████| 74/74 [00:00<00:00, 14683.44it/s]
100%|██████████| 74/74 [00:00<00:00, 21402.46it/s]
100%|██████████| 74/74 [00:00<00:00, 53467.44it/s]


curated_file is: dor_antag_G_GTP_IC50_curated.csv



100%|██████████| 151/151 [00:00<00:00, 7331.34it/s]


Cluster 0: 49 data points


100%|██████████| 151/151 [00:00<00:00, 4778.26it/s]
100%|██████████| 151/151 [00:00<00:00, 7258.49it/s]
100%|██████████| 151/151 [00:00<00:00, 9579.08it/s]
100%|██████████| 151/151 [00:00<00:00, 26833.03it/s]


Cluster 1: 35 data points


100%|██████████| 151/151 [00:00<00:00, 7351.51it/s]
100%|██████████| 151/151 [00:00<00:00, 7030.47it/s]
100%|██████████| 151/151 [00:00<00:00, 9386.29it/s]
100%|██████████| 151/151 [00:00<00:00, 25683.92it/s]


Cluster 2: 15 data points


100%|██████████| 151/151 [00:00<00:00, 7096.96it/s]
100%|██████████| 151/151 [00:00<00:00, 7093.70it/s]
100%|██████████| 151/151 [00:00<00:00, 9401.34it/s]
100%|██████████| 151/151 [00:00<00:00, 27016.16it/s]


Cluster 3: 34 data points


100%|██████████| 151/151 [00:00<00:00, 7009.38it/s]
100%|██████████| 151/151 [00:00<00:00, 7093.94it/s]
100%|██████████| 151/151 [00:00<00:00, 9527.06it/s]
100%|██████████| 151/151 [00:00<00:00, 26727.71it/s]


Cluster 4: 18 data points


100%|██████████| 151/151 [00:00<00:00, 7134.94it/s]
100%|██████████| 151/151 [00:00<00:00, 7021.90it/s]
100%|██████████| 151/151 [00:00<00:00, 9440.72it/s]
100%|██████████| 151/151 [00:00<00:00, 26205.72it/s]


curated_file is: mor_agon_B_arrest_EC50_curated.csv



100%|██████████| 169/169 [00:00<00:00, 6375.01it/s]


Cluster 0: 45 data points


100%|██████████| 169/169 [00:00<00:00, 4339.30it/s]
100%|██████████| 169/169 [00:00<00:00, 6649.82it/s]
100%|██████████| 169/169 [00:00<00:00, 11194.17it/s]
100%|██████████| 169/169 [00:00<00:00, 24267.77it/s]


Cluster 1: 47 data points


100%|██████████| 169/169 [00:00<00:00, 6591.08it/s]
100%|██████████| 169/169 [00:00<00:00, 6352.50it/s]
100%|██████████| 169/169 [00:00<00:00, 10804.13it/s]
100%|██████████| 169/169 [00:00<00:00, 23714.87it/s]


Cluster 2: 46 data points


100%|██████████| 169/169 [00:00<00:00, 6452.54it/s]
100%|██████████| 169/169 [00:00<00:00, 6325.12it/s]
100%|██████████| 169/169 [00:00<00:00, 10757.73it/s]
100%|██████████| 169/169 [00:00<00:00, 23914.08it/s]


Cluster 3: 11 data points


100%|██████████| 169/169 [00:00<00:00, 6362.48it/s]
100%|██████████| 169/169 [00:00<00:00, 6281.74it/s]
100%|██████████| 169/169 [00:00<00:00, 11112.74it/s]
100%|██████████| 169/169 [00:00<00:00, 23008.22it/s]


Cluster 4: 20 data points


100%|██████████| 169/169 [00:00<00:00, 6486.02it/s]
100%|██████████| 169/169 [00:00<00:00, 6230.44it/s]
100%|██████████| 169/169 [00:00<00:00, 10993.14it/s]
100%|██████████| 169/169 [00:00<00:00, 23846.51it/s]


curated_file is: kor_bind_RBA_IC50_curated.csv



100%|██████████| 408/408 [00:00<00:00, 2697.39it/s]


Cluster 0: 68 data points


100%|██████████| 408/408 [00:00<00:00, 2490.38it/s]
100%|██████████| 408/408 [00:00<00:00, 2631.77it/s]
100%|██████████| 408/408 [00:00<00:00, 4416.31it/s]
100%|██████████| 408/408 [00:00<00:00, 10020.41it/s]


Cluster 1: 218 data points


100%|██████████| 408/408 [00:00<00:00, 2669.76it/s]
100%|██████████| 408/408 [00:00<00:00, 2666.47it/s]
100%|██████████| 408/408 [00:00<00:00, 4476.11it/s]
100%|██████████| 408/408 [00:00<00:00, 10036.28it/s]


Cluster 2: 20 data points


100%|██████████| 408/408 [00:00<00:00, 2729.97it/s]
100%|██████████| 408/408 [00:00<00:00, 2644.11it/s]
100%|██████████| 408/408 [00:00<00:00, 4439.97it/s]
100%|██████████| 408/408 [00:00<00:00, 10027.16it/s]


Cluster 3: 53 data points


100%|██████████| 408/408 [00:00<00:00, 2699.16it/s]
100%|██████████| 408/408 [00:00<00:00, 2634.44it/s]
100%|██████████| 408/408 [00:00<00:00, 4473.14it/s]
100%|██████████| 408/408 [00:00<00:00, 10050.13it/s]


Cluster 4: 49 data points


100%|██████████| 408/408 [00:00<00:00, 2709.00it/s]
100%|██████████| 408/408 [00:00<00:00, 2644.73it/s]
100%|██████████| 408/408 [00:00<00:00, 4492.34it/s]
100%|██████████| 408/408 [00:00<00:00, 9880.06it/s]


curated_file is: kor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 237/237 [00:00<00:00, 4696.98it/s]


Cluster 0: 59 data points


100%|██████████| 237/237 [00:00<00:00, 3232.25it/s]
100%|██████████| 237/237 [00:00<00:00, 4578.74it/s]
100%|██████████| 237/237 [00:00<00:00, 4759.82it/s]
100%|██████████| 237/237 [00:00<00:00, 16961.00it/s]


Cluster 1: 63 data points


100%|██████████| 237/237 [00:00<00:00, 4665.89it/s]
100%|██████████| 237/237 [00:00<00:00, 4536.29it/s]
100%|██████████| 237/237 [00:00<00:00, 4818.82it/s]
100%|██████████| 237/237 [00:00<00:00, 17106.06it/s]


Cluster 2: 68 data points


100%|██████████| 237/237 [00:00<00:00, 4621.15it/s]
100%|██████████| 237/237 [00:00<00:00, 4516.71it/s]
100%|██████████| 237/237 [00:00<00:00, 4766.14it/s]
100%|██████████| 237/237 [00:00<00:00, 17237.16it/s]


Cluster 3: 30 data points


100%|██████████| 237/237 [00:00<00:00, 4594.21it/s]
100%|██████████| 237/237 [00:00<00:00, 4564.15it/s]
100%|██████████| 237/237 [00:00<00:00, 4791.13it/s]
100%|██████████| 237/237 [00:00<00:00, 17313.72it/s]


Cluster 4: 17 data points


100%|██████████| 237/237 [00:00<00:00, 4558.02it/s]
100%|██████████| 237/237 [00:00<00:00, 4545.00it/s]
100%|██████████| 237/237 [00:00<00:00, 4800.83it/s]
100%|██████████| 237/237 [00:00<00:00, 16778.07it/s]


An error occurred while stratifying data based on cliff molecules. Skipping stratification.
curated_file is: kor_agon_G_GTP_EC50_curated.csv



100%|██████████| 1022/1022 [00:00<00:00, 1065.87it/s]


Cluster 0: 359 data points


100%|██████████| 1022/1022 [00:00<00:00, 1085.22it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1055.84it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1274.07it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3953.21it/s]


Cluster 1: 233 data points


100%|██████████| 1022/1022 [00:00<00:00, 1081.61it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1050.27it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1269.62it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3942.07it/s]


Cluster 2: 112 data points


100%|██████████| 1022/1022 [00:00<00:00, 1076.64it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1055.45it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1268.50it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3955.67it/s]


Cluster 3: 88 data points


100%|██████████| 1022/1022 [00:00<00:00, 1077.28it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1042.08it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1271.25it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3917.89it/s]


Cluster 4: 230 data points


100%|██████████| 1022/1022 [00:00<00:00, 1077.77it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1051.65it/s]
100%|██████████| 1022/1022 [00:00<00:00, 1265.69it/s]
100%|██████████| 1022/1022 [00:00<00:00, 3942.47it/s]


curated_file is: nor_agon_G_GTP_EC50_curated.csv



100%|██████████| 190/190 [00:00<00:00, 5782.39it/s]


Cluster 0: 26 data points


100%|██████████| 190/190 [00:00<00:00, 3819.78it/s]
100%|██████████| 190/190 [00:00<00:00, 5742.97it/s]
100%|██████████| 190/190 [00:00<00:00, 9988.94it/s]
100%|██████████| 190/190 [00:00<00:00, 21634.80it/s]


Cluster 1: 40 data points


100%|██████████| 190/190 [00:00<00:00, 5745.70it/s]
100%|██████████| 190/190 [00:00<00:00, 5912.60it/s]
100%|██████████| 190/190 [00:00<00:00, 9870.54it/s]
100%|██████████| 190/190 [00:00<00:00, 21220.58it/s]


Cluster 2: 61 data points


100%|██████████| 190/190 [00:00<00:00, 5833.91it/s]
100%|██████████| 190/190 [00:00<00:00, 5650.34it/s]
100%|██████████| 190/190 [00:00<00:00, 9853.94it/s]
100%|██████████| 190/190 [00:00<00:00, 20932.96it/s]


Cluster 3: 43 data points


100%|██████████| 190/190 [00:00<00:00, 5829.17it/s]
100%|██████████| 190/190 [00:00<00:00, 5733.84it/s]
100%|██████████| 190/190 [00:00<00:00, 9922.03it/s]
100%|██████████| 190/190 [00:00<00:00, 21612.50it/s]


Cluster 4: 20 data points


100%|██████████| 190/190 [00:00<00:00, 5814.67it/s]
100%|██████████| 190/190 [00:00<00:00, 5638.78it/s]
100%|██████████| 190/190 [00:00<00:00, 9886.34it/s]
100%|██████████| 190/190 [00:00<00:00, 21788.59it/s]


curated_file is: mor_bind_RBA_IC50_curated.csv



100%|██████████| 561/561 [00:00<00:00, 1970.44it/s]


Cluster 0: 20 data points


100%|██████████| 561/561 [00:00<00:00, 2011.64it/s]
100%|██████████| 561/561 [00:00<00:00, 1955.06it/s]
100%|██████████| 561/561 [00:00<00:00, 3246.95it/s]
100%|██████████| 561/561 [00:00<00:00, 7226.34it/s]


Cluster 1: 77 data points


100%|██████████| 561/561 [00:00<00:00, 1960.48it/s]
100%|██████████| 561/561 [00:00<00:00, 1918.84it/s]
100%|██████████| 561/561 [00:00<00:00, 3230.60it/s]
100%|██████████| 561/561 [00:00<00:00, 7227.96it/s]


Cluster 2: 101 data points


100%|██████████| 561/561 [00:00<00:00, 1960.83it/s]
100%|██████████| 561/561 [00:00<00:00, 1924.38it/s]
100%|██████████| 561/561 [00:00<00:00, 3221.85it/s]
100%|██████████| 561/561 [00:00<00:00, 7296.68it/s]


Cluster 3: 322 data points


100%|██████████| 561/561 [00:00<00:00, 1977.83it/s]
100%|██████████| 561/561 [00:00<00:00, 1928.14it/s]
100%|██████████| 561/561 [00:00<00:00, 3247.28it/s]
100%|██████████| 561/561 [00:00<00:00, 7246.21it/s]


Cluster 4: 41 data points


100%|██████████| 561/561 [00:00<00:00, 1976.80it/s]
100%|██████████| 561/561 [00:00<00:00, 1962.05it/s]
100%|██████████| 561/561 [00:00<00:00, 3222.38it/s]
100%|██████████| 561/561 [00:00<00:00, 7253.87it/s]


curated_file is: kor_antag_G_GTP_IC50_curated.csv



100%|██████████| 133/133 [00:00<00:00, 8200.55it/s]


Cluster 0: 78 data points


100%|██████████| 133/133 [00:00<00:00, 5311.47it/s]
100%|██████████| 133/133 [00:00<00:00, 8475.66it/s]
100%|██████████| 133/133 [00:00<00:00, 13207.75it/s]
100%|██████████| 133/133 [00:00<00:00, 30708.05it/s]


Cluster 1: 19 data points


100%|██████████| 133/133 [00:00<00:00, 8281.02it/s]
100%|██████████| 133/133 [00:00<00:00, 8167.77it/s]
100%|██████████| 133/133 [00:00<00:00, 12886.17it/s]
100%|██████████| 133/133 [00:00<00:00, 30811.51it/s]


Cluster 2: 5 data points


100%|██████████| 133/133 [00:00<00:00, 8158.69it/s]
100%|██████████| 133/133 [00:00<00:00, 8050.72it/s]
100%|██████████| 133/133 [00:00<00:00, 13271.54it/s]
100%|██████████| 133/133 [00:00<00:00, 30622.08it/s]

Cluster 3: 16 data points



100%|██████████| 133/133 [00:00<00:00, 8248.57it/s]
100%|██████████| 133/133 [00:00<00:00, 7949.76it/s]
100%|██████████| 133/133 [00:00<00:00, 13189.32it/s]
100%|██████████| 133/133 [00:00<00:00, 30718.20it/s]

Cluster 4: 15 data points



100%|██████████| 133/133 [00:00<00:00, 8131.46it/s]
100%|██████████| 133/133 [00:00<00:00, 8061.66it/s]
100%|██████████| 133/133 [00:00<00:00, 13304.45it/s]
100%|██████████| 133/133 [00:00<00:00, 30730.04it/s]


curated_file is: nor_agon_G_cAMP_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: nor_bind_RBA_IC50_curated.csv



100%|██████████| 417/417 [00:00<00:00, 2672.76it/s]


Cluster 0: 201 data points


100%|██████████| 417/417 [00:00<00:00, 2416.47it/s]
100%|██████████| 417/417 [00:00<00:00, 2581.35it/s]
100%|██████████| 417/417 [00:00<00:00, 4554.46it/s]
100%|██████████| 417/417 [00:00<00:00, 9706.61it/s]


Cluster 1: 46 data points


100%|██████████| 417/417 [00:00<00:00, 2633.11it/s]
100%|██████████| 417/417 [00:00<00:00, 2559.51it/s]
100%|██████████| 417/417 [00:00<00:00, 4584.31it/s]
100%|██████████| 417/417 [00:00<00:00, 9681.79it/s]


Cluster 2: 33 data points


100%|██████████| 417/417 [00:00<00:00, 2635.05it/s]
100%|██████████| 417/417 [00:00<00:00, 2570.45it/s]
100%|██████████| 417/417 [00:00<00:00, 4571.13it/s]
100%|██████████| 417/417 [00:00<00:00, 9621.60it/s]


Cluster 3: 75 data points


100%|██████████| 417/417 [00:00<00:00, 2621.80it/s]
100%|██████████| 417/417 [00:00<00:00, 2570.66it/s]
100%|██████████| 417/417 [00:00<00:00, 4637.38it/s]
100%|██████████| 417/417 [00:00<00:00, 9650.27it/s]


Cluster 4: 62 data points


100%|██████████| 417/417 [00:00<00:00, 2635.44it/s]
100%|██████████| 417/417 [00:00<00:00, 2586.03it/s]
100%|██████████| 417/417 [00:00<00:00, 4575.21it/s]
100%|██████████| 417/417 [00:00<00:00, 9729.72it/s]


curated_file is: nor_agon_G_cAMP_IC50_curated.csv



100%|██████████| 59/59 [00:00<00:00, 17759.72it/s]


Cluster 0: 27 data points


100%|██████████| 59/59 [00:00<00:00, 11487.51it/s]
100%|██████████| 59/59 [00:00<00:00, 12077.89it/s]
100%|██████████| 59/59 [00:00<00:00, 24237.41it/s]
100%|██████████| 59/59 [00:00<00:00, 50845.27it/s]


Cluster 1: 7 data points


100%|██████████| 59/59 [00:00<00:00, 12111.59it/s]
100%|██████████| 59/59 [00:00<00:00, 18217.31it/s]
100%|██████████| 59/59 [00:00<00:00, 32356.69it/s]
100%|██████████| 59/59 [00:00<00:00, 65466.65it/s]


Cluster 2: 6 data points


100%|██████████| 59/59 [00:00<00:00, 18631.53it/s]
100%|██████████| 59/59 [00:00<00:00, 18500.59it/s]
100%|██████████| 59/59 [00:00<00:00, 32276.50it/s]
100%|██████████| 59/59 [00:00<00:00, 65920.07it/s]


Cluster 3: 7 data points


100%|██████████| 59/59 [00:00<00:00, 18538.01it/s]
100%|██████████| 59/59 [00:00<00:00, 18185.18it/s]
100%|██████████| 59/59 [00:00<00:00, 32754.99it/s]
100%|██████████| 59/59 [00:00<00:00, 65397.45it/s]


Cluster 4: 12 data points


100%|██████████| 59/59 [00:00<00:00, 18463.32it/s]
100%|██████████| 59/59 [00:00<00:00, 18006.54it/s]
100%|██████████| 59/59 [00:00<00:00, 32318.65it/s]
100%|██████████| 59/59 [00:00<00:00, 66043.22it/s]


curated_file is: dor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_agon_G_Ca_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: mor_bind_RBA_Ki_curated.csv



100%|██████████| 4523/4523 [00:18<00:00, 239.74it/s] 


Cluster 0: 514 data points


100%|██████████| 4523/4523 [00:18<00:00, 241.11it/s] 
[10:07:30] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:19<00:00, 237.78it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 349.50it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 889.46it/s] 


Cluster 1: 225 data points


100%|██████████| 4523/4523 [00:18<00:00, 240.91it/s] 
[10:08:32] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:19<00:00, 236.62it/s] 
100%|██████████| 4523/4523 [00:13<00:00, 346.66it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 885.28it/s] 


Cluster 2: 1200 data points


100%|██████████| 4523/4523 [00:18<00:00, 242.65it/s] 
[10:09:33] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:19<00:00, 236.83it/s] 
100%|██████████| 4523/4523 [00:13<00:00, 345.67it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 879.25it/s] 


Cluster 3: 2437 data points


100%|██████████| 4523/4523 [00:18<00:00, 240.56it/s] 
[10:10:35] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:18<00:00, 241.21it/s] 
100%|██████████| 4523/4523 [00:13<00:00, 345.70it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 897.05it/s] 


Cluster 4: 147 data points


100%|██████████| 4523/4523 [00:18<00:00, 240.17it/s] 
[10:11:37] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 4523/4523 [00:19<00:00, 236.31it/s] 
100%|██████████| 4523/4523 [00:12<00:00, 349.74it/s] 
100%|██████████| 4523/4523 [00:05<00:00, 877.49it/s] 


curated_file is: dor_agon_G_GTP_EC50_curated.csv



100%|██████████| 581/581 [00:00<00:00, 1896.35it/s]


Cluster 0: 201 data points


100%|██████████| 581/581 [00:00<00:00, 1923.96it/s]
100%|██████████| 581/581 [00:00<00:00, 1838.35it/s]
100%|██████████| 581/581 [00:00<00:00, 2462.58it/s]
100%|██████████| 581/581 [00:00<00:00, 6929.05it/s]


Cluster 1: 114 data points


100%|██████████| 581/581 [00:00<00:00, 1883.97it/s]
100%|██████████| 581/581 [00:00<00:00, 1824.30it/s]
100%|██████████| 581/581 [00:00<00:00, 2462.32it/s]
100%|██████████| 581/581 [00:00<00:00, 6910.48it/s]


Cluster 2: 39 data points


100%|██████████| 581/581 [00:00<00:00, 1878.70it/s]
100%|██████████| 581/581 [00:00<00:00, 1892.37it/s]
100%|██████████| 581/581 [00:00<00:00, 2451.01it/s]
100%|██████████| 581/581 [00:00<00:00, 6986.46it/s]


Cluster 3: 89 data points


100%|██████████| 581/581 [00:00<00:00, 1901.44it/s]
100%|██████████| 581/581 [00:00<00:00, 1828.02it/s]
100%|██████████| 581/581 [00:00<00:00, 2448.87it/s]
100%|██████████| 581/581 [00:00<00:00, 6915.15it/s]


Cluster 4: 138 data points


100%|██████████| 581/581 [00:00<00:00, 1881.29it/s]
100%|██████████| 581/581 [00:00<00:00, 1822.03it/s]
100%|██████████| 581/581 [00:00<00:00, 2452.17it/s]
100%|██████████| 581/581 [00:00<00:00, 6920.53it/s]


curated_file is: mor_agon_G_Ca_EC50_curated.csv



100%|██████████| 144/144 [00:00<00:00, 7352.25it/s]


Cluster 0: 24 data points


100%|██████████| 144/144 [00:00<00:00, 4978.48it/s]
100%|██████████| 144/144 [00:00<00:00, 7838.70it/s]
100%|██████████| 144/144 [00:00<00:00, 12104.29it/s]
100%|██████████| 144/144 [00:00<00:00, 28494.99it/s]


Cluster 1: 25 data points


100%|██████████| 144/144 [00:00<00:00, 7757.15it/s]
100%|██████████| 144/144 [00:00<00:00, 7354.40it/s]
100%|██████████| 144/144 [00:00<00:00, 11840.19it/s]
100%|██████████| 144/144 [00:00<00:00, 26862.65it/s]


Cluster 2: 26 data points


100%|██████████| 144/144 [00:00<00:00, 7715.04it/s]
100%|██████████| 144/144 [00:00<00:00, 7221.10it/s]
100%|██████████| 144/144 [00:00<00:00, 12031.71it/s]
100%|██████████| 144/144 [00:00<00:00, 27333.11it/s]


Cluster 3: 35 data points


100%|██████████| 144/144 [00:00<00:00, 7285.91it/s]
100%|██████████| 144/144 [00:00<00:00, 7414.62it/s]
100%|██████████| 144/144 [00:00<00:00, 11849.48it/s]
100%|██████████| 144/144 [00:00<00:00, 27631.98it/s]


Cluster 4: 34 data points


100%|██████████| 144/144 [00:00<00:00, 7655.59it/s]
100%|██████████| 144/144 [00:00<00:00, 7517.61it/s]
100%|██████████| 144/144 [00:00<00:00, 12058.37it/s]
100%|██████████| 144/144 [00:00<00:00, 27654.75it/s]


curated_file is: kor_antag_G_GTP_Ki_curated.csv



100%|██████████| 53/53 [00:00<00:00, 20095.65it/s]


Cluster 0: 12 data points


100%|██████████| 53/53 [00:00<00:00, 12259.99it/s]
100%|██████████| 53/53 [00:00<00:00, 13502.07it/s]
100%|██████████| 53/53 [00:00<00:00, 23180.20it/s]
100%|██████████| 53/53 [00:00<00:00, 39143.88it/s]


Cluster 1: 3 data points


100%|██████████| 53/53 [00:00<00:00, 13269.15it/s]
100%|██████████| 53/53 [00:00<00:00, 13211.58it/s]
100%|██████████| 53/53 [00:00<00:00, 24710.77it/s]
100%|██████████| 53/53 [00:00<00:00, 55449.77it/s]


Cluster 2: 10 data points


100%|██████████| 53/53 [00:00<00:00, 20373.76it/s]
100%|██████████| 53/53 [00:00<00:00, 19823.27it/s]
100%|██████████| 53/53 [00:00<00:00, 33569.63it/s]
100%|██████████| 53/53 [00:00<00:00, 73100.33it/s]


Cluster 3: 10 data points


100%|██████████| 53/53 [00:00<00:00, 20684.67it/s]
100%|██████████| 53/53 [00:00<00:00, 20876.98it/s]
100%|██████████| 53/53 [00:00<00:00, 33463.51it/s]
100%|██████████| 53/53 [00:00<00:00, 72174.71it/s]


Cluster 4: 18 data points


100%|██████████| 53/53 [00:00<00:00, 20876.98it/s]
100%|██████████| 53/53 [00:00<00:00, 20095.65it/s]
100%|██████████| 53/53 [00:00<00:00, 33443.37it/s]
100%|██████████| 53/53 [00:00<00:00, 73681.84it/s]


curated_file is: mor_antag_G_GTP_IC50_curated.csv



100%|██████████| 268/268 [00:00<00:00, 4137.87it/s]


Cluster 0: 58 data points


100%|██████████| 268/268 [00:00<00:00, 3058.13it/s]
100%|██████████| 268/268 [00:00<00:00, 4057.69it/s]
100%|██████████| 268/268 [00:00<00:00, 6309.77it/s]
100%|██████████| 268/268 [00:00<00:00, 15103.03it/s]


Cluster 1: 50 data points


100%|██████████| 268/268 [00:00<00:00, 4063.78it/s]
100%|██████████| 268/268 [00:00<00:00, 4045.54it/s]
100%|██████████| 268/268 [00:00<00:00, 6314.84it/s]
100%|██████████| 268/268 [00:00<00:00, 15192.24it/s]


Cluster 2: 75 data points


100%|██████████| 268/268 [00:00<00:00, 4098.25it/s]
100%|██████████| 268/268 [00:00<00:00, 4101.48it/s]
100%|██████████| 268/268 [00:00<00:00, 6327.96it/s]
100%|██████████| 268/268 [00:00<00:00, 14940.63it/s]


Cluster 3: 52 data points


100%|██████████| 268/268 [00:00<00:00, 4099.41it/s]
100%|██████████| 268/268 [00:00<00:00, 4039.39it/s]
100%|██████████| 268/268 [00:00<00:00, 6322.30it/s]
100%|██████████| 268/268 [00:00<00:00, 15195.32it/s]


Cluster 4: 33 data points


100%|██████████| 268/268 [00:00<00:00, 4069.81it/s]
100%|██████████| 268/268 [00:00<00:00, 4076.33it/s]
100%|██████████| 268/268 [00:00<00:00, 6306.80it/s]
100%|██████████| 268/268 [00:00<00:00, 14969.28it/s]


curated_file is: nor_bind_RBA_Ki_curated.csv



100%|██████████| 1126/1126 [00:01<00:00, 974.45it/s]


Cluster 0: 85 data points


100%|██████████| 1126/1126 [00:01<00:00, 971.82it/s]
100%|██████████| 1126/1126 [00:01<00:00, 944.17it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1671.19it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3618.26it/s]


Cluster 1: 510 data points


100%|██████████| 1126/1126 [00:01<00:00, 978.19it/s]
100%|██████████| 1126/1126 [00:01<00:00, 950.40it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1664.87it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3566.73it/s]


Cluster 2: 245 data points


100%|██████████| 1126/1126 [00:01<00:00, 983.53it/s]
100%|██████████| 1126/1126 [00:01<00:00, 950.03it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1666.98it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3631.01it/s]


Cluster 3: 145 data points


100%|██████████| 1126/1126 [00:01<00:00, 978.00it/s]
100%|██████████| 1126/1126 [00:01<00:00, 951.49it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1675.59it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3610.49it/s]


Cluster 4: 141 data points


100%|██████████| 1126/1126 [00:01<00:00, 979.72it/s]
100%|██████████| 1126/1126 [00:01<00:00, 949.31it/s]
100%|██████████| 1126/1126 [00:00<00:00, 1650.32it/s]
100%|██████████| 1126/1126 [00:00<00:00, 3595.96it/s]


curated_file is: mor_agon_G_cAMP_EC50_curated.csv



100%|██████████| 418/418 [00:00<00:00, 2654.47it/s]


Cluster 0: 134 data points


100%|██████████| 418/418 [00:00<00:00, 2357.36it/s]
100%|██████████| 418/418 [00:00<00:00, 2594.75it/s]
100%|██████████| 418/418 [00:00<00:00, 3984.62it/s]
100%|██████████| 418/418 [00:00<00:00, 9698.46it/s]


Cluster 1: 57 data points


100%|██████████| 418/418 [00:00<00:00, 2634.91it/s]
100%|██████████| 418/418 [00:00<00:00, 2571.25it/s]
100%|██████████| 418/418 [00:00<00:00, 3984.14it/s]
100%|██████████| 418/418 [00:00<00:00, 9667.71it/s]


Cluster 2: 81 data points


100%|██████████| 418/418 [00:00<00:00, 2625.20it/s]
100%|██████████| 418/418 [00:00<00:00, 2574.79it/s]
100%|██████████| 418/418 [00:00<00:00, 3977.36it/s]
100%|██████████| 418/418 [00:00<00:00, 9695.61it/s]


Cluster 3: 60 data points


100%|██████████| 418/418 [00:00<00:00, 2617.33it/s]
100%|██████████| 418/418 [00:00<00:00, 2583.43it/s]
100%|██████████| 418/418 [00:00<00:00, 3964.39it/s]
100%|██████████| 418/418 [00:00<00:00, 9746.93it/s]


Cluster 4: 86 data points


100%|██████████| 418/418 [00:00<00:00, 2554.08it/s]
100%|██████████| 418/418 [00:00<00:00, 2556.59it/s]
100%|██████████| 418/418 [00:00<00:00, 3976.55it/s]
100%|██████████| 418/418 [00:00<00:00, 9712.69it/s]


curated_file is: kor_agon_B_arrest_EC50_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_Ki_curated.csv



100%|██████████| 3923/3923 [00:14<00:00, 280.12it/s] 


Cluster 0: 445 data points


100%|██████████| 3923/3923 [00:13<00:00, 282.44it/s] 
[10:13:43] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 283.97it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 386.73it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1052.35it/s]


Cluster 1: 2233 data points


100%|██████████| 3923/3923 [00:13<00:00, 283.19it/s] 
[10:14:30] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:14<00:00, 279.99it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 381.00it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1023.25it/s]


Cluster 2: 871 data points


100%|██████████| 3923/3923 [00:14<00:00, 278.48it/s] 
[10:15:17] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:14<00:00, 278.53it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 380.36it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1015.14it/s]


Cluster 3: 148 data points


100%|██████████| 3923/3923 [00:13<00:00, 280.35it/s] 
[10:16:04] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:14<00:00, 278.64it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 378.74it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1021.93it/s]


Cluster 4: 226 data points


100%|██████████| 3923/3923 [00:14<00:00, 280.00it/s] 
[10:16:51] Explicit valence for atom # 12 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(NC(=O)c2cccc(S(F)(F)(F)(F)F)c2)cc(-n2ccn3nc(-c4cccnc4)cc23)c1, then used a normal scaffold instead


100%|██████████| 3923/3923 [00:13<00:00, 280.31it/s] 
100%|██████████| 3923/3923 [00:10<00:00, 381.39it/s] 
100%|██████████| 3923/3923 [00:03<00:00, 1039.02it/s]


curated_file is: kor_bind_RBA_Ki_curated.csv



100%|██████████| 3759/3759 [00:12<00:00, 289.78it/s] 


Cluster 0: 398 data points


100%|██████████| 3759/3759 [00:13<00:00, 288.57it/s] 
[10:17:55] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:13<00:00, 282.22it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 395.08it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1061.15it/s]


Cluster 1: 1858 data points


100%|██████████| 3759/3759 [00:13<00:00, 288.43it/s] 
[10:18:39] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:13<00:00, 282.41it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 395.03it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1065.91it/s]


Cluster 2: 290 data points


100%|██████████| 3759/3759 [00:13<00:00, 288.21it/s] 
[10:19:23] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:13<00:00, 285.21it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 396.70it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1066.85it/s]


Cluster 3: 898 data points


100%|██████████| 3759/3759 [00:13<00:00, 288.62it/s] 
[10:20:07] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:13<00:00, 287.06it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 396.92it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1069.16it/s]


Cluster 4: 315 data points


100%|██████████| 3759/3759 [00:13<00:00, 287.24it/s] 
[10:20:51] Explicit valence for atom # 32 C, 6, is greater than permitted


Could not create a generic scaffold of Cc1cc(C)c(-n2ccn3nc(-c4cccnc4)cc23)cc1NC(=O)c1cc(C#N)cc(S(F)(F)(F)(F)F)c1, then used a normal scaffold instead


100%|██████████| 3759/3759 [00:13<00:00, 284.88it/s] 
100%|██████████| 3759/3759 [00:09<00:00, 395.09it/s] 
100%|██████████| 3759/3759 [00:03<00:00, 1064.46it/s]


curated_file is: nor_antag_G_GTP_Ki_curated.csv

The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.
curated_file is: dor_bind_RBA_IC50_curated.csv



100%|██████████| 672/672 [00:00<00:00, 1614.09it/s]


Cluster 0: 297 data points


100%|██████████| 672/672 [00:00<00:00, 1647.29it/s]
100%|██████████| 672/672 [00:00<00:00, 1582.48it/s]
100%|██████████| 672/672 [00:00<00:00, 2490.39it/s]
100%|██████████| 672/672 [00:00<00:00, 6033.38it/s]


Cluster 1: 143 data points


100%|██████████| 672/672 [00:00<00:00, 1628.44it/s]
100%|██████████| 672/672 [00:00<00:00, 1584.32it/s]
100%|██████████| 672/672 [00:00<00:00, 2474.74it/s]
100%|██████████| 672/672 [00:00<00:00, 6002.81it/s]


Cluster 2: 87 data points


100%|██████████| 672/672 [00:00<00:00, 1623.66it/s]
100%|██████████| 672/672 [00:00<00:00, 1602.71it/s]
100%|██████████| 672/672 [00:00<00:00, 2457.33it/s]
100%|██████████| 672/672 [00:00<00:00, 6008.38it/s]


Cluster 3: 83 data points


100%|██████████| 672/672 [00:00<00:00, 1610.28it/s]
100%|██████████| 672/672 [00:00<00:00, 1596.90it/s]
100%|██████████| 672/672 [00:00<00:00, 2461.48it/s]
100%|██████████| 672/672 [00:00<00:00, 5973.89it/s]


Cluster 4: 62 data points


100%|██████████| 672/672 [00:00<00:00, 1638.13it/s]
100%|██████████| 672/672 [00:00<00:00, 1600.09it/s]
100%|██████████| 672/672 [00:00<00:00, 2480.65it/s]
100%|██████████| 672/672 [00:00<00:00, 5938.12it/s]


Done!

use_clustering is: False

curated_file is: nor_antag_G_GTP_IC50_curated.csv



UnboundLocalError: local variable 'cliff_mols' referenced before assignment