In [None]:
import os
import warnings

import numpy as np
import pandas as pd

from datacat4ml.utils import mkdirs, get_df_name
from datacat4ml.const import *
#from datacat4ml.Scripts.data_prep.data_split.split_utils 

# Split the heterogeneous datasets of ORs
based on the train/test split of the corresponding categorized datasets

In [None]:
def _copy_split_labels(merged_df, cat_df):
    """
    Transfer the train/test assignment of a categorized dataset onto the merged dataset.

    A merged row is labelled 'test' when its 'CuratedSmiles' occurs among the 'smiles' of
    the categorized rows whose 'split' is 'test'; every other row is labelled 'train'.
    'cliff_mol' is filled for test rows only, taken from the first categorized row that has
    the same SMILES, and stays NaN for train rows.

    Returns a labelled copy; the input frames are left untouched.
    """
    test_smiles = cat_df.loc[cat_df['split'] == 'test', 'smiles']
    is_test = merged_df['CuratedSmiles'].isin(test_smiles)

    labelled_df = merged_df.copy()
    labelled_df['split'] = np.where(is_test.to_numpy(), 'test', 'train')
    labelled_df['cliff_mol'] = np.nan
    if is_test.any():
        # Build one SMILES -> cliff_mol table (first occurrence wins) instead of
        # scanning the categorized data once per test row.
        cliff_by_smiles = (cat_df.drop_duplicates(subset='smiles', keep='first')
                                 .set_index('smiles')['cliff_mol'])
        labelled_df.loc[is_test, 'cliff_mol'] = (
            labelled_df.loc[is_test, 'CuratedSmiles'].map(cliff_by_smiles))
    return labelled_df


def merged_data_spliter(task:str='cls', confidence_score:int=9, thr_class:int=7, use_clustering: bool=True,
                        target:str='mor', std_type:str='Ki', effect:str='antag', assay:str='G_GTP'):
    
    """
    Split the merged data into train and test sets based on the corresponding categorized data.

    The merged file is read from
    FETCH_DATA_DIR/curated/<task>/confidence_score_<n>/thr_class_<n>/<target>_<std_type>_final.csv
    and the categorized file from
    DATASETS_DIR/random_splited/<task>/confidence_score_<n>/thr_class_<n>/use_clustering_<flag>/
    <target>_<effect>_<assay>_<std_type>_final.csv.
    Molecules that are in the categorized test set become the test set of the merged data.
    The result is written to
    FETCH_DATA_DIR/assaywise_splited/.../<target>_<std_type>_<effect>_<assay>_assaywise-splited.csv.

    Parameters
    ----------
    task : 'cls' or 'reg'; selects the input folders and which column carries the 'y(...)' name.
    confidence_score, thr_class, use_clustering : used to build the input/output paths.
    target, std_type, effect, assay : identify the merged and categorized files.
    All eight arguments are also stored as constant metadata columns in the output.

    Returns
    -------
    pandas.DataFrame with the columns written to the csv file.

    Raises
    ------
    ValueError
        If `task` is neither 'cls' nor 'reg' (checked before any file is read).
    """
    if task not in ('cls', 'reg'):
        raise ValueError(f"Unsupported task '{task}': expected 'cls' or 'reg'.")

    conf_dir = 'confidence_score'+'_'+str(confidence_score)
    thr_dir = 'thr_class'+'_'+str(thr_class)
    cluster_dir = 'use_clustering'+'_'+str(use_clustering)

    # load merged data
    merged_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'curated', task, conf_dir, thr_dir,
                                         f"{target}_{std_type}_final.csv")).drop(columns=['Unnamed: 0'])
    # load categorized data
    cat_df = pd.read_csv(os.path.join(DATASETS_DIR, 'random_splited', task, conf_dir, thr_dir, cluster_dir,
                                      f"{target}_{effect}_{assay}_{std_type}_final.csv")).drop(columns=['Unnamed: 0'])

    # add 'split' and 'cliff_mol' to the merged data, following the categorized split
    merged_df = _copy_split_labels(merged_df, cat_df)

    # the two tasks share the same columns; only the column flagged as the label 'y(...)' differs
    if task == 'cls':
        value_col, activity_col = 'pstandard_value_mean', 'y(activity)'
    else:
        value_col, activity_col = 'y(pstandard_value_mean)', 'activity'

    pvalues = merged_df['pstandard_value_mean']
    splited_merged_df = pd.DataFrame({
        'molecule_chembl_id': merged_df['molecule_chembl_id'].tolist(),
        'smiles': merged_df['CuratedSmiles'].tolist(),
        # NOTE(review): keeps the existing 10**|p| conversion unchanged; confirm that this
        # really yields nM for the way 'pstandard_value_mean' is defined upstream.
        'exp_mean [nM]': (10**abs(np.array(pvalues))).tolist(),
        value_col: pvalues.tolist(),
        'cliff_mol': merged_df['cliff_mol'].tolist(),
        activity_col: merged_df['activity'].tolist(),
        'split': merged_df['split'].tolist(),
    })

    # record which dataset / settings each row came from
    splited_merged_df['file_path'] = 'merged'
    splited_merged_df['task'] = task
    splited_merged_df['confidence_score'] = confidence_score
    splited_merged_df['thr_class'] = thr_class
    splited_merged_df['use_clustering'] = use_clustering
    splited_merged_df['target'] = target
    splited_merged_df['std_type'] = std_type
    splited_merged_df['effect'] = effect
    splited_merged_df['assay'] = assay

    # save splited_merged_df as csv file
    output_folder = os.path.join(FETCH_DATA_DIR, 'assaywise_splited', task, conf_dir, thr_dir, cluster_dir)
    mkdirs(output_folder) # make directory if not exist
    splited_merged_df.to_csv(os.path.join(output_folder, f"{target}_{std_type}_{effect}_{assay}_assaywise-splited.csv"))

    return splited_merged_df

In [None]:
# Run merged_data_spliter for every categorized dataset found under
# DATASETS_DIR/curated/<task>/confidence_score_<n>/thr_class_<n>, once per use_clustering setting.
# The target / effect / assay / std_type of a dataset are taken from the first row of its file.
for task in Tasks:
    for confidence_score in Confidence_scores:
        for thr_class in Thr_classes:
            folder_path = os.path.join(DATASETS_DIR, 'curated', task, 'confidence_score'+'_'+str(confidence_score), 'thr_class'+'_'+str(thr_class))
            # The folder does not depend on use_clustering, so list and read it once here.
            # Sorted for a reproducible order; entries that are not csv files are skipped.
            csv_files = sorted(name for name in os.listdir(folder_path) if name.lower().endswith('.csv'))
            # Only the first row is needed to identify a dataset, so do not load whole files.
            first_rows = [pd.read_csv(os.path.join(folder_path, name), nrows=1).drop(columns=['Unnamed: 0'])
                          for name in csv_files]

            for use_clustering in Use_clusterings:
                print(f"Number of files: {len(first_rows)}")
                for first_row in first_rows:
                    if first_row.empty:
                        print("Empty DataFrame detected.")
                        continue

                    # assign the first value in each metadata column to the matching variable
                    target = first_row['target'].iloc[0]
                    effect = first_row['effect'].iloc[0]
                    assay = first_row['assay'].iloc[0]
                    std_type = first_row['std_type'].iloc[0]

                    print(target, effect, assay, std_type)
                    try:
                        merged_data_spliter(task=task, confidence_score=confidence_score, thr_class=thr_class, use_clustering=use_clustering,
                                            target=target, std_type=std_type, effect=effect, assay=assay)
                    # a failing dataset is reported but must not stop the remaining ones
                    except Exception as e:
                        warnings.warn(f"Error in {target} {effect} {assay} {std_type}:\n{e}"    )

# Split