In [1]:
# conda env: pyg (Python3.9.16)
import os
import sys
from typing import List

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.cluster import SpectralClustering
#from OpioML.Scripts.data_prep.data_split.cliff import ActivityCliffs, get_tanimoto_matrix

from datacat4ml.utils import mkdirs, get_df_name
from datacat4ml.const import *

# Split the categorized datasets

In [None]:
def _split_indices(indices, cliff_flags, test_size):
    """
    Split one group of molecule indices into a train part and a test part.

    The split is stratified on the cliff flags when more than two molecules of the group
    are flagged. If scikit-learn rejects the stratified split with a ValueError, or too few
    molecules are flagged, a plain shuffled split is used instead.

    :param indices: Positions (into the full dataset) of the molecules to split
    :param cliff_flags: Cliff flag of every molecule in `indices`, in the same order
    :param test_size: Test set size, passed through to `train_test_split`

    :return: Tuple (train indices, test indices), both lists
    """
    # plain Python ints so that later set membership tests behave predictably
    indices = [int(i) for i in indices]

    # train_test_split raises ValueError when it cannot leave at least one sample in the
    # train part; a group of 0 or 1 molecules is therefore kept entirely for training.
    if len(indices) < 2:
        return indices, []

    if sum(cliff_flags) > 2:
        try:
            part_train, part_test = train_test_split(indices, test_size=test_size,
                                                     stratify=cliff_flags,  # keep the cliff proportion equal in both parts
                                                     random_state=RANDOM_SEED, shuffle=True)
            return list(part_train), list(part_test)
        except ValueError:
            # raised e.g. when one of the two classes has a single member in this group
            print("An error occurred while stratifying data based on cliff molecules. Skipping stratification.")

    part_train, part_test = train_test_split(indices, test_size=test_size,
                                             random_state=RANDOM_SEED, shuffle=True)
    return list(part_train), list(part_test)


def data_spliter(smiles: List[str], bioactivity: List[float], active: List[float],
               in_log10: bool=True, similarity: float=0.9, potency_fold: int=10,
               use_clustering: bool=True, n_clusters: int=5, test_size: float=0.2, 
               task: str='cls'):
    """
    Split the data into train and test sets according to activity cliffs and compound characteristics.

    :param smiles: List of SMILES strings
    :param bioactivity: List of bioactivity values
    :param active: List of active/inactive labels
    :param in_log10: Whether the bioactivity values are in log10; if False they are replaced by -log10(value)
    :param similarity: Threshold value to determine structural similarity
    :param potency_fold: Threshold value to determine potency difference
    :param use_clustering: Whether to cluster the molecules first and split every cluster separately
    :param n_clusters: Number of clusters to use if clustering is used
    :param test_size: Test set size
    :param task: 'cls' or 'reg'; only changes which output column carries the 'y(...)' name

    :return: A dataframe with one row per molecule (SMILES, activity columns, cliff flag,
             active label and a 'split' column holding 'train' or 'test'), or None when fewer
             than 50 molecules are given
    :raises ValueError: If `task` is neither 'cls' nor 'reg', or if an index ends up in neither split
    """
    # NOTE(review): `ActivityCliffs` and `get_tanimoto_matrix` are used below, but the only
    # visible import for them is commented out at the top of the notebook. Confirm they are
    # in scope (e.g. through the star import) before running.

    # Fail fast: the cliff detection and clustering below are expensive, so an unsupported
    # task must not be discovered only when the result frame is built.
    if task not in ('cls', 'reg'):
        raise ValueError(f"Unknown task '{task}'; expected 'cls' or 'reg'")

    if len(smiles) < 50:
        print("The number of data points in this file is lower than 50, which is not enough to build a good machine learning model. Skipping this file.")
        return None

    if not in_log10:
        bioactivity = (-np.log10(bioactivity)).tolist()

    # get activity cliffs: one flag per molecule
    cliffs = ActivityCliffs(smiles, bioactivity)
    cliff_mols = cliffs.get_cliff_molecules(return_smiles=False, similarity=similarity, potency_fold=potency_fold)

    if use_clustering:
        # cluster the dataset into `n_clusters` clusters from the pairwise Tanimoto matrix
        # NOTE(review): affinity='precomputed' treats the matrix as an affinity (similarity)
        # matrix; confirm that get_tanimoto_matrix returns similarities, not distances.
        spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=RANDOM_SEED)
        clusters = spectral.fit(get_tanimoto_matrix(smiles)).labels_  # cluster label of each molecule

        train_idx, test_idx = [], []
        for cluster in range(n_clusters):
            # `[0]` unpacks the single index array from the tuple returned by np.where
            cluster_idx = np.where(clusters == cluster)[0]
            print(f"Cluster {cluster}: {len(cluster_idx)} data points")

            clust_cliff_mols = [cliff_mols[i] for i in cluster_idx]
            clust_train_idx, clust_test_idx = _split_indices(cluster_idx, clust_cliff_mols, test_size)
            train_idx.extend(clust_train_idx)
            test_idx.extend(clust_test_idx)
    else:
        # Don't cluster before splitting. This avoids potential cheating: splitting inside
        # every cluster makes the structural diversity of the train and test sets the same.
        train_idx, test_idx = _split_indices(range(len(smiles)), cliff_mols, test_size)

    # sets give constant-time membership tests; the index lists would make this loop quadratic
    train_set, test_set = set(train_idx), set(test_idx)
    train_test = []
    for i in range(len(smiles)):
        if i in train_set:
            train_test.append('train')
        elif i in test_set:
            train_test.append('test')
        else:
            raise ValueError('Index not in train or test set')

    # Both tasks share the same columns in the same order; only the position of the
    # 'y(...)' marker differs.
    if task == 'cls':
        activity_col, active_col = 'pstandard_value_mean', 'y(active)'
    else:
        activity_col, active_col = 'y(pstandard_value_mean)', 'active'

    # NOTE(review): 10**abs(value) only yields nM if the log values are -log10 of a
    # concentration in nM; verify against the units of the incoming bioactivity values.
    return pd.DataFrame({'smiles': smiles,
                         'exp_mean [nM]': (10**abs(np.array(bioactivity))).tolist(),
                         activity_col: bioactivity,
                         'cliff_mol': cliff_mols,
                         active_col: active,
                         'split': train_test})

In [None]:
def split_data(filepath = FETCH_DATA_DIR, task:str = 'cls', confidence_score:int = 8,
               use_clustering: bool=True):
    """
    Split every curated '*_final.csv' file of one task/confidence-score folder into train and test sets.

    Input files are read from  <filepath>/curated/<task>/confidence_score_<confidence_score>/
    and results are written to <filepath>/splited/<task>/confidence_score_<confidence_score>/use_clustering_<use_clustering>/
    under the same file name. Files for which `data_spliter` returns None are not written.

    :param filepath: Root folder that contains the 'curated' folder and receives the 'splited' folder
    :param task: Task name; used as a path component and forwarded to `data_spliter`
    :param confidence_score: Confidence score; used as a path component
    :param use_clustering: Forwarded to `data_spliter`; also used as a path component

    :return: None
    """
    score_dir = 'confidence_score'+'_'+str(confidence_score)

    # access the final csv files obtained from data curation
    folder_path = os.path.join(filepath, 'curated', task, score_dir)
    # os.listdir returns entries in arbitrary order; sort so that logs are reproducible
    final_files = sorted(file for file in os.listdir(folder_path) if file.endswith('_final.csv'))

    # make new directory to store the split data
    new_path = os.path.join(filepath, 'splited', task, score_dir, 'use_clustering'+'_'+str(use_clustering))
    mkdirs(new_path)

    for final_file in final_files:
        print (f"final_file is: {final_file}\n")

        df = pd.read_csv(os.path.join(folder_path, final_file))
        # 'Unnamed: 0' is the index column left over from an earlier to_csv call;
        # errors='ignore' keeps files that were saved without an index from raising KeyError
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')

        # split the data into train and test sets
        data_splited_df = data_spliter(df['CuratedSmiles'].tolist(), df['pstandard_value_mean'].tolist(), df['active'].tolist(), 
                          task=task, use_clustering=use_clustering)

        # save data_splited_df as a csv file if it is not None
        if data_splited_df is not None:
            data_splited_df.to_csv(os.path.join(new_path, final_file))

In [None]:
# Run the split for every combination of data folder, task, confidence score and
# clustering mode. The loops stay nested so that each level is announced once
# before its inner combinations are processed.
use_clusterings = [True, False]

for filepath in File_paths:
    print(f"filepath is: {filepath}\n")

    for task in Tasks:
        print(f"task is: {task}\n")

        for confidence_score in Confidence_scores:
            print(f"confidence_score is: {confidence_score}\n")

            for use_clustering in use_clusterings:
                print(f"use_clustering is: {use_clustering}\n")

                split_data(
                    filepath=filepath,
                    task=task,
                    confidence_score=confidence_score,
                    use_clustering=use_clustering,
                )
                print('Done!\n====================================\n')
# NOTE: a full local run of this cell took around 1 hour.