In [7]:
# conda env: pyg (Python3.9.16)
import argparse
import os
import shutil

from datacat4ml.const import *
from datacat4ml.Scripts.data_prep.data_curate.utils.curate_dataset_type import curate_datasets_and_get_stats

In [8]:
def _remove_task_output(output_root, task):
    """Delete ``<output_root>/<task>`` and everything below it, if it exists.

    Called before re-curating a task so results of a previous run are discarded.
    """
    task_dir = os.path.join(output_root, task)
    if os.path.exists(task_dir):
        # remove the directory and its contents
        shutil.rmtree(task_dir)


def main():
    """
    Run the curation process for every task in ``Tasks`` on:
    - categorized datasets for ORs
    - heterogeneous datasets for ORs
    - heterogeneous datasets for GPCRs

    and get the stats for each dataset type.

    For each task, the previously curated output of the two heterogeneous
    dataset types (``CURA_HET_DATASETS_DIR/<task>`` and
    ``CURA_GPCR_DATASETS_DIR/<task>``) is deleted before it is regenerated.
    The categorized output directory is NOT cleared.

    Takes no arguments and returns ``None``; all work is delegated to
    ``curate_datasets_and_get_stats``.
    """
    # One (effect, assay, std_types) entry per categorized OR dataset family.
    # The order is the processing order.
    cat_or_specs = (
        # Binding affinity
        ('bind', 'RBA', ("Ki", 'IC50')),
        # Agonism
        ('agon', 'G_GTP', ("EC50",)),
        ('agon', 'G_Ca', ("EC50",)),
        ('agon', 'G_cAMP', ("IC50", "EC50")),
        ('agon', 'B_arrest', ("EC50",)),
        # Antagonism
        ('antag', 'G_GTP', ("IC50", "Ki", "Kb", "Ke")),
        ('antag', 'B_arrest', ("IC50",)),
    )
    # Standard types shared by both heterogeneous dataset types.
    het_std_types = ("Ki", "IC50", 'EC50')

    for task in Tasks:
        # keep the 'task' in the argument to ensure the stats are generated only once
        # NOTE(review): this depends on how curate_datasets_and_get_stats writes its stats -- confirm.
        print(f'----------->Task is {task}\n')

        ## ==== categorized data for ORs ====
        print('Processing categorized datasets of ORs...')
        # NOTE(review): unlike the het outputs below, CURA_CAT_DATASETS_DIR/<task> is not
        # removed first (the cleanup was commented out in the original) -- confirm that
        # stale files from an earlier run cannot end up in the results.
        for effect, assay, std_types in cat_or_specs:
            # pass a fresh list on every call, exactly like the original literal arguments
            curate_datasets_and_get_stats(dataset_type='cat', task=task, target_list=OR_names,
                                          effect=effect, assay=assay, std_types=list(std_types),
                                          input_path=CAT_DATASETS_DIR, output_path=CURA_CAT_DATASETS_DIR)

        ## ==== het data for ORs ====
        print('Processing heterogeneous data of ORs...')
        _remove_task_output(CURA_HET_DATASETS_DIR, task)
        curate_datasets_and_get_stats(dataset_type='het', task=task, target_list=OR_names,
                                      effect=None, assay=None, std_types=list(het_std_types),
                                      input_path=HET_DATASETS_DIR, output_path=CURA_HET_DATASETS_DIR)

        ## ==== het data for GPCRs ====
        print('Processing heterogeneous data of GPCRs...')
        _remove_task_output(CURA_GPCR_DATASETS_DIR, task)
        # Every sub-directory of HET_GPCR_DIR is treated as one target; plain files are skipped.
        # os.listdir() returns entries in arbitrary order, so sort for a reproducible run.
        gpcr_chembl_ids = sorted(
            entry for entry in os.listdir(HET_GPCR_DIR)
            if os.path.isdir(os.path.join(HET_GPCR_DIR, entry))
        )
        curate_datasets_and_get_stats(dataset_type='het', task=task, target_list=gpcr_chembl_ids,
                                      effect=None, assay=None, std_types=list(het_std_types),
                                      input_path=HET_GPCR_DIR, output_path=CURA_GPCR_DATASETS_DIR)

In [None]:
# Run the curation for every task; main() prints a progress line per task and dataset group.
main()

----------->Task is cls

Processing categorized datasets of ORs...
Processing mor_bind_RBA_Ki...
The length of the raw dataset is 5557
Curating dataset
start standardizing with value
After standardizing the SMILES, the shape of the df: (5403, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (5206, 34)
After removing the mols with multiple values, the shape of the df:(4523, 35)
start applying thresholds
Applying thresholds 
The length of df_novalue is 154
After standardizing the SMILES, the shape of the df: (154, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (144, 34)
After dropping the duplicate combinations of (smiles, value) , the shape of the df:(131, 35)
Done curation.

Processing mor_bind_RBA_IC50...
The length of the raw dataset is 687
Curating dataset
start standardizing with value
After standardizing the SMILES, the shape of the df: (665, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (630, 34)
After removing the mols w

Failed curating the dataset due to cannot set a frame with no defined index and a scalar


After standardizing the SMILES, the shape of the df: (47, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (2, 34)
After removing the mols with multiple values, the shape of the df:(1, 35)
start applying thresholds
Applying thresholds 

Processing mor_antag_G_GTP_IC50...
The length of the raw dataset is 377
Curating dataset
start standardizing with value
After standardizing the SMILES, the shape of the df: (301, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (294, 34)
After removing the mols with multiple values, the shape of the df:(268, 35)
start applying thresholds
Applying thresholds 
The length of df_novalue is 76
After standardizing the SMILES, the shape of the df: (76, 34)
After dropping the mols with MW > 900.0 , the shape of the df: (76, 34)
After dropping the duplicate combinations of (smiles, value) , the shape of the df:(71, 35)
Done curation.

Processing mor_antag_G_GTP_Ki...
The length of the raw dataset is 63
Curating dataset
start s