In [None]:
import os
from typing import List
from tqdm import tqdm
import argparse

import numpy as np
import pandas as pd

# for `def random_split`
import random
from sklearn.model_selection import StratifiedKFold
# for `def cluster_aware_split`
from rdkit.ML.Cluster import Butina 
# for molecular distance calculations
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem.Scaffolds.MurckoScaffold import MakeScaffoldGeneric as GraphFramework
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
from Levenshtein import distance as levenshtein

from datacat4ml.Scripts.const import RANDOM_SEED
from datacat4ml.Scripts.const import CURA_LHD_OR_DIR, CURA_MHD_OR_DIR, CURA_MHD_effect_OR_DIR, CURA_HHD_OR_DIR
from datacat4ml.Scripts.const import SPL_DATA_DIR, SPL_LHD_OR_DIR, SPL_MHD_OR_DIR, SPL_MHD_effect_OR_DIR, SPL_HHD_OR_DIR

# Functions

In [15]:
#===============================================================================
# Molecular distance 
#===============================================================================
"""Adopted from https://github.com/molML/MoleculeACE/blob/main/MoleculeACE/benchmark/cliffs.py"""
#  Substructure distance or similarity based on morgan fingerprint
def get_substructure_mat(smiles: List[str], radius: int = 2, nBits: int = 1024, distance: bool=True):

    """ 
    Calculates a matrix of Tanimoto distance or similarity scores for the whole molecules of a list of SMILES string.
    
    This method capture the “global” differences or similarity between molecules by considering the entire set of substructures they contain
    """

    if distance:
        returnDistance = 1 # return distance matrix. Distance = 1 - Similarity.
    else:
        returnDistance = 0 # return similarity matrix

    fps = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        fps.append(fp)

    # generate the distance/similarity matrix based on the fingerprints:
    mat = np.zeros((len(fps), len(fps)), float)
    for i, fp in enumerate(fps):
        if i == len(fps) - 1:
            break
        ds = np.array(
            DataStructs.BulkTanimotoSimilarity(fp,
                                               fps[i + 1:],
                                               returnDistance=returnDistance))
        mat[i, i + 1:] = ds
        mat[i + 1:, i] = ds

    return mat

# scaffold distance/similarity based on morgan fingerprint
def get_scaffold_mat(smiles: List[str], radius: int = 2, nBits: int = 1024, distance: bool=True):
    """ Calculates a matrix of Tanimoto distance/similarity scores for the scaffolds of a list of SMILES string """
    
    if distance:
        returnDistance = 1 # return distance matrix. Distance = 1 - Similarity.
    else:
        returnDistance = 0 # return similarity matrix
    
    scaf_fps = {}
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        try:
            skeleton = GraphFramework(mol) # returns the generic scaffold graph, whcih represents the connectivity and topology of the molecule
        except Exception: # In the very rare case that the molecule cannot be processed, then use a normal scaffold
            print(f"Could not create a generic scaffold of {smi}, then used a normal scaffold instead")
            skeleton = GetScaffoldForMol(mol) # returns the Murcko scaffold, which is the result of removing side chains while retaining ring systems
        skeleton_fp = AllChem.GetMorganFingerprintAsBitVect(skeleton, radius=radius, nBits=nBits)
        scaf_fps.append(skeleton_fp)

    mat = np.zeros((len(smiles), len(smiles)), float)
    for i, scaf_fp in enumerate(scaf_fps):
        if i == len(scaf_fps) - 1:
            break
        ds = np.array(
            DataStructs.BulkTanimotoSimilarity(scaf_fp,
                                               scaf_fps[i + 1:],
                                               returnDistance=returnDistance))
        mat[i, i + 1:] = ds
        mat[i + 1:, i] = ds

    return mat

# levenstein distance/similarity based on SMILES strings
def get_levenshtein_mat(smiles: List[str], normalize: bool = True, distance: bool=True):
    """ Calculates a matrix of levenshtein distance/similarity scores for a list of SMILES string
    Levenshtein distance/similarity, i.e edit distance/similarity, measures the number of single character edits (insertions, deletions or substitutions) required to change one string into the other.
    As SMILES is a text-based representation of a molecule, this similarity metric can be used to measure the similarity between two molecules.
    
    """
    smi_len = len(smiles)

    mat = np.zeros([smi_len, smi_len])
    # calcultate the upper triangle of the matrix
    for i in tqdm(range(smi_len)):
        for j in range(i, smi_len):
            # normalized to [0,1] or not
            if normalize: 
                dist = levenshtein(smiles[i], smiles[j]) / max(len(smiles[i]), len(smiles[j]))
                sim = 1 - dist
            else:
                dist = levenshtein(smiles[i], smiles[j])
                sim = 1.0 / (1 + dist)
            
            # return distance or similarity matrix
            if distance:
                mat[i, j] = dist
            else:
                mat[i, j] = sim
    # fill in the lower triangle without having to loop (saves ~50% of time)
    mat = mat + mat.T - np.diag(np.diag(mat))
    # get from a distance matrix to a similarity matrix
    mat = 1 - mat

    # fill the diagonal with 0's
    np.fill_diagonal(mat, 0)

    return mat

def molecule_similarity(smiles: List[str], similarity: float = 0.9,):
    """ Calculate which pairs of molecules have a high substructure, scaffold, or SMILES similarity """
    m_subs = get_substructure_mat(smiles) <= (1 - similarity)
    m_scaff = get_scaffold_mat(smiles) <= (1 - similarity)
    m_leve = get_levenshtein_mat(smiles) <= (1 - similarity)

    return (m_subs + m_scaff + m_leve).astype(int)

def find_stereochemical_siblings(smiles: List[str]):
    """
    Detects molecules that have different SMILES strings, but encode for the same molecule with different stereochemistry. 
    For racemic mixtures it is often unclear which one is measured/active

    returns:
        pair_smis: List of pairs of SMILES strings that are stereochemical siblings
        pair_idx: List of pairs of indices corresponding to the SMILES strings in the input list
    """
    smat_lower = np.tril(get_substructure_mat(smiles, radius=4, nBits=4096, distance=False), k=0)
    identical = np.where(smat_lower == 1) # identical[0] is the row indices, identical[1] is the column indices

    pair_idx = []
    pair_smis = []
    for i, j in zip(identical[0], identical[1]):
        pair_idx.append([i, j])
        pair_smis.append([smiles[i], smiles[j]])
    #print(f'pair_idx is \n{pair_idx}')
    #print(f'len(pair_smis): {len(pair_smis)}')

    return pair_smis, pair_idx

#===============================================================================
# Data splitting methods
#===============================================================================
# random split
def random_split(x, y, n_folds=5, random_seed=RANDOM_SEED):
    """
    randomly split the dataset into training and testing sets for n_folds times, stratified on y

    params
    ------
    - x: list or np.array
        input features
    - y: list or np.array
        target values # StratifiedKFold will stratifiy based on y
    - n_folds: int
        number of folds for cross-validation. The test_size will be 1/n_folds automatically.

    returns
    -----------
    - test_splits: List[np.ndarray]
        A list of lists, each containing the indices of the test set for each fold.
    """
    skf = StratifiedKFold(n_splits=n_folds, random_state=random_seed, shuffle=True)
    train_folds, test_folds = [], []
    for train_idx, test_idx in skf.split(x, y):
        train_folds.append((train_idx.tolist()))
        test_folds.append((test_idx.tolist()))

    return train_folds, test_folds

# cluster-aware split
"""Adopted from https://github.com/rinikerlab/molecular_time_series/blob/main/ga_lib_3.py#L1171"""
def clusterData(dmat, threshold, clusterSizeThreshold, combineRandom=False):
    """ 
    Cluster data based on a distance matrix using the Butina algorithm.

    Params
    ------
    - dmat: a distance matrix get from `get_substructure_matrix`, `get_scaffold_matrix`, or `get_levenshtein_matrix`
    - threshold: float, the distance threshold for clustering. E.g. 0.4 means mols with a similarity above 0.6 will be grouped into the same cluster.
    - clusterSizeThreshold: int, the minimum size for a cluster to be considered "large". Clusters smaller than this size will be handled according to the `combineRandom` parameter.
    - combineRandom: bool, if True, small clusters will be combined randomly to form larger clusters. If False, points from small clusters will be added to the nearest larger cluster based on the distance matrix.

    Returns
    -------
    - largeClusters: List of clusters, where each cluster is represented as a list of indices
    """
    nfps = len(dmat)
    symmDmat = []
    for i in range(1, nfps):
        symmDmat.extend(dmat[i, :i]) # convert a square distance matrix to a 1D array representing the upper triangle of the matrix
    cs = Butina.ClusterData(symmDmat, nfps, threshold, isDistData=True, reordering=True)
    cs = sorted(cs, key=lambda x: len(x), reverse=True) # sort clusters by size in descending order

    # start with the large clusters:
    largeClusters = [list(c) for c in cs if len(c) >= clusterSizeThreshold]
    if not largeClusters:
        raise ValueError("no clusters found")
    # now combine the small clusters to make larger ones:
    if combineRandom:
        tmpCluster = []
        for c in cs:
            if len(c) >= clusterSizeThreshold:
                continue
            tmpCluster.extend(c)
            if len(tmpCluster) >= clusterSizeThreshold:
                random.shuffle(tmpCluster)
                largeClusters.append(tmpCluster)
                tmpCluster = []
        if tmpCluster:
            largeClusters.append(tmpCluster)
    else:
        # add points from small clusters to the nearest larger cluster
        #  nearest is defined by the nearest neighbor in that cluster
        for c in cs:
            if len(c) >= clusterSizeThreshold:
                continue
            for idx in c:
                closest = -1
                minD = 1e5
                for cidx, clust in enumerate(largeClusters):
                    for elem in clust:
                        d = dmat[idx, elem]
                        if d < minD:
                            closest = cidx
                            minD = d
                assert closest > -1
                largeClusters[closest].append(idx)
    return largeClusters

def cluster_aware_split(dist_type, x, clusterSizeThreshold=5, threshold=0.65, combineRandom=False, 
                        random_seed=RANDOM_SEED, test_size=0.2,n_samples=5, selectionStrategy='clust_holdout'):
    """
    Assigns data points to training and testing sets based on selection strategies using clustering.

    Params
    ------
    - dist_type: str, the type of distance metric to use. Options are 'substruct', 'scaf', and 'levenshtein'.
    - x: list or np.array
        input features (e.g., list of SMILES strings)
    - clusterSizeThreshold: int, the minimum size for a cluster to be considered "large". Clusters smaller than this size will be handled according to the `combineRandom` parameter.
    - threshold: float, the distance threshold for clustering. Molecules with distances below this threshold will be grouped into the same cluster.
    - combineRandom: bool, if True, small clusters will be combined randomly to form larger clusters. If False, points from small clusters will be added to the nearest larger cluster based on the distance matrix.
    
    - randomSeed: int, seed for random number generator to ensure reproducibility.
    - test_size: float, the proportion of the dataset to include in the test split.
    - n_samples: int, the number of different train-test splits to generate.
        Here, n_samples is different from n_folds in `random_split`. 
        - n_samples is the number of repeated samplings of train-test splits, each split is independent from each other.
        - n_folds is the number of folds in cross-validation, each fold won't have identical data points.
    - selectionStrategy: SelectionStrategy, the strategy to use for selecting test samples.
        - 'cluster_stratified' ensures each fold has different data points for all kinds of datasets (hhd, mhd, lhd, small or large)
        - 'cluster_holdout' only ensure each fold for mhd and hhd has different data points, but for lhd or very small datasets, some folds may have identical data points
    
    Returns
    -------
    - test_folds: List[np.ndarray]
        A list of lists, each containing the indices of the test set for each sampling.
    - train_folds: List[np.ndarray]
        A list of lists, each containing the indices of the train set for each sampling.

    """

    # get distance matrix
    if dist_type == 'substruct':
        dmat = get_substructure_mat(x)
    elif dist_type == 'scaf':
        dmat = get_scaffold_mat(x)
    elif dist_type == 'levenshtein':
        dmat = get_levenshtein_mat(x)

    # cluster the data
    clusterSizeThreshold=max(5, len(x)/50) # set a minimum cluster size based on the dataset size
    largeClusters = clusterData(dmat, threshold, clusterSizeThreshold, combineRandom)

    # assign data into train and test sets
    random.seed(random_seed) # set the random seed for reproducibility
    nTest= round(len(dmat)*test_size)

    test_folds = [] # list of lists, each containing the indices of the test samples for each split
    train_folds = []

    for i in range(n_samples): 
        # ensure distributional overlap between train and test splits -> easier task
        if selectionStrategy == 'clust_stratified': 
            ordered = []
            for c in largeClusters:
                random.shuffle(c) # shuffle the 
                ordered.extend((i / len(c), x) for i, x in enumerate(c))
            ordered = [y for x, y in sorted(ordered)]
            test=ordered[:nTest]
            train=ordered[nTest:]

        # ensure cluster disjointness - train and test cover different regions of chemical space -> harder task
        elif selectionStrategy == 'clust_holdout': 
            random.shuffle(largeClusters)
            test = []
            train = []
            for c in largeClusters:
                if len(test) < nTest:
                    nRequired = nTest - len(test)
                    test.extend(c[:nRequired])
                    train.extend(c[nRequired:])
                else: 
                    train.extend(c) # all remaining clusters go to train set
                    
        test_folds.append(test)
        train_folds.append(train)

    return train_folds,test_folds
#===============================================================================
# Internal splitting
#===============================================================================
Cura_Spl_Dic = {CURA_HHD_OR_DIR: SPL_HHD_OR_DIR,
                 CURA_MHD_OR_DIR: SPL_MHD_OR_DIR,
                 CURA_LHD_OR_DIR: SPL_LHD_OR_DIR,
                 CURA_MHD_effect_OR_DIR: SPL_MHD_effect_OR_DIR
                 }

def add_fold_columns(df, prefix, train_folds, test_folds):
    """
    Add 'train/test' columns to df for each fold using prefix.
    """
    df = df.copy()
    for i, (train_idx, test_idx) in enumerate(zip(train_folds, test_folds)):
        fold_col = f"{prefix}_fold{i}"
        df[fold_col] = np.nan # initialize the column with NaN

        # mark train/test according to the fold indices
        df.loc[train_idx, fold_col] = 'train'
        df.loc[test_idx, fold_col] = 'test'
    return df

def random_splitter(df, n_folds, aim):
    """
    Apply stratified random split to the dataset for two stereo modes:
    - rmvStereo0: full dataset
    - rmvStereo1: dataset without stereochemical siblings

    params
    ------
    - df: pd.DataFrame.
    - n_folds: int, number of folds for cross-validation. The test_size will be 1/n_folds automatically.
    - aim: str, the aim of the model build for. Options are 'lo' and 'vs'. It is used to name the split columns.

    returns
    -----------
    - df: pd.DataFrame
        a new df with additional columns for the random splits
    """
    df_result = df.copy()
    activity_col = 'lo_activity' if aim == 'lo' else 'vs_activity'

    # Internal helper
    def _safe_split(sub_df, prefix, n_folds, random_seed=RANDOM_SEED):
        """Perform safe stratified split and return updated df or None."""
        if len(sub_df) == 0:
            print(f"{prefix}: skipped — no data available.")
            return None
        
        x = sub_df['canonical_smiles_by_Std'].tolist()
        y = sub_df[activity_col].tolist()

        # Adjust folds if class imbalance prevents stratification
        unique, counts = np.unique(y, return_counts=True)
        min_class_count = min(counts)
        if min_class_count < n_folds:
            print(f"{prefix}: resetting n_folds {n_folds} → {min_class_count} due to class imbalance.")
            n_folds = min_class_count

        if n_folds < 2:
            print(f"{prefix}: skipped — k-fold CV not applicable (n_folds < 2).")
            return None

        try:
            train_folds, test_folds = random_split(x, y, n_folds, RANDOM_SEED)
            df_split = add_fold_columns(sub_df, prefix, train_folds, test_folds)
            return df_split
        except ValueError as e:
            print(f"{prefix}: skipped due to {e}")
            return None

    # --- Perform both splits ---
    df_rmvStereo0 = _safe_split(df, f"int.rmvStereo0_rs_{aim}", n_folds)
    df_rmvStereo1_sub = _safe_split(df[df['stereoSiblings'] == False].reset_index(drop=True),
                                    f"int.rmvStereo1_rs_{aim}", n_folds)

    # --- Merge results ---
    if df_rmvStereo0 is not None:
        df_result = df_rmvStereo0.copy()

    if df_rmvStereo1_sub is not None:
        merge_cols = ['activity_id'] + [col for col in df_rmvStereo1_sub.columns if col.startswith(f"int.rmvStereo1_rs_{aim}")]
        df_result = df_result.merge(df_rmvStereo1_sub[merge_cols], on='activity_id', how='left')

    print("Random splitting completed.")

    return df_result

def cluster_aware_splitter(df, selectionStrategy):
    """
    Apply cluster-aware split and add new columns for train/test folds.

    params
    ------
    - df: pd.DataFrame.
    - selectionStrategy: SelectionStrategy, the strategy to use for selecting test samples. Options are 'clust_stratified' and 'clust_holdout'.

    returns
    -----------
    - df: pd.DataFrame
        a new df with additional columns for the cluster-aware splits
    """

    sS = 'cs' if selectionStrategy == 'clust_stratified' else 'ch'
    df_result = df.copy()

    def _safe_cluster_split(sub_df, prefix, selectionStrategy, random_seed=RANDOM_SEED):
        """Perform a safe cluster-aware split and return df with new columns or None."""
        if len(sub_df) == 0:
            print(f"{prefix}: skipped — empty subset.")
            return None

        x = sub_df['canonical_smiles_by_Std'].tolist()

        try:
            train_folds, test_folds = cluster_aware_split(
                dist_type='substruct',
                selectionStrategy=selectionStrategy,
                x=x,
                threshold=0.65,
                combineRandom=False,
                random_seed=random_seed,
                test_size=0.2,
                n_samples=2
            )

            # Check and deduplicate folds if necessary
            tupled_test_folds = [tuple(sorted(fold)) for fold in test_folds]
            if len(tupled_test_folds) != len(set(tupled_test_folds)):
                print(f"{prefix}: duplicate test folds detected. Keeping unique ones.")
                test_folds = list(set(tupled_test_folds))

            # Add fold columns
            return add_fold_columns(sub_df, prefix, train_folds, test_folds)

        except ValueError as e:
            print(f"{prefix}: cluster-aware split skipped due to {e}")
            return None

    # --- Run for both rmvStereo0 and rmvStereo1 ---
    df_rmvStereo0 = _safe_cluster_split(df, f"int.rmvStereo0_{sS}", selectionStrategy)
    df_rmvStereo1_sub = _safe_cluster_split(
        df[df['stereoSiblings'] == False].reset_index(drop=True),
        f"int.rmvStereo1_{sS}",
        selectionStrategy
    )

    # --- Merge results ---
    if df_rmvStereo0 is not None:
        df_result = df_rmvStereo0.copy()

    if df_rmvStereo1_sub is not None:
        merge_cols = ['activity_id'] + [col for col in df_rmvStereo1_sub.columns if col.startswith(f"int.rmvStereo1_{sS}")]
        df_result = df_result.merge(df_rmvStereo1_sub[merge_cols], on='activity_id', how='left')

    print(f"Cluster-aware split completed for strategy '{selectionStrategy}'.")
    return df_result

def internal_split(in_dir: str = CURA_HHD_OR_DIR, rmv_dupMol: int = 1):
    """
    Split data into train-test folds for file(s) in the input directory.

    params
    ------
    - in_dir: str, input directory containing files to be split.
    """
    # input directory
    print(f"in_dir is: {in_dir}\n")
    in_file_dir = os.path.join(in_dir, f'rmvDupMol{str(rmv_dupMol)}')
    files = os.listdir(in_file_dir)

    # output directory
    out_dir = os.path.join(Cura_Spl_Dic[in_dir], f'rmvDupMol{str(rmv_dupMol)}')
    print(f"out_dir is: {out_dir}\n")
    os.makedirs(out_dir, exist_ok=True)

    # split data
    for f in files:

        print (f"\ninput_file is: {f}\n")
        df = pd.read_csv(os.path.join(in_file_dir, f))
        
        # skip files with less than 40 data points
        #if len(df) < 40:
        #    print(f"Skip {f}, because it has less than 40 data points, not enough for building ML models")
        #else:

        # apply split
        print("random split...")
        df = random_splitter(df, n_folds=5, aim='lo')
        df = random_splitter(df, n_folds=5, aim='vs')

        print("cluster-aware split...")
        df = cluster_aware_splitter(df, selectionStrategy='clust_stratified')
        df = cluster_aware_splitter(df, selectionStrategy='clust_holdout')
        
        # save the new df
        out_file = os.path.join(out_dir, f[:-12] + f"_split.csv")
        df.to_csv(out_file, index=False)

# Main

In [16]:
# read curated data
in_path = '/storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/data_prep/data_curate/cura_hhd_or'
csv_file = 'CHEMBL233_None_None_Ki_None_hhd_b50_b50_curated.csv'

cura_df = pd.read_csv(os.path.join(in_path, 'rmvDupMol0', csv_file))
print(f'cura_df shape: {cura_df.shape}')
cura_df.head()

cura_df shape: (5258, 50)


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,vs_activity,vs_threshold,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,0.0,5.0,inactive,0.0,6.3,False,,,,
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,1.0,5.0,inactive,0.0,6.3,True,,,,
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,1.0,5.0,inactive,0.0,6.3,True,,,,
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,1.0,5.0,inactive,0.0,6.3,False,,,,
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,1.0,5.0,inactive,0.0,6.3,True,,,,


## random_split (train_folds, test_folds)

In [77]:
# ==>'add_fold_columns' function
x = cura_df['canonical_smiles_by_Std'].tolist()[:10]
y = cura_df['lo_activity'].tolist()[:10]

train_folds, test_folds = random_split(x, y, n_folds=2, random_seed=RANDOM_SEED)
print(f'train_folds: \n{train_folds}, \ntest_folds: \n{test_folds}')

df_split = add_fold_columns(cura_df[:10], prefix=f'int.rmvStereo0_rs_lo', train_folds=train_folds, test_folds=test_folds)
df_split

train_folds: 
[[0, 2, 4, 6, 9], [1, 3, 5, 7, 8]], 
test_folds: 
[[1, 3, 5, 7, 8], [0, 2, 4, 6, 9]]


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_rs_lo_fold0,int.rmvStereo0_rs_lo_fold1
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,inactive,0.0,6.3,False,,,,,train,test
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,inactive,0.0,6.3,True,,,,,test,train
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,inactive,0.0,6.3,True,,,,,train,test
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,inactive,0.0,6.3,False,,,,,test,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,inactive,0.0,6.3,True,,,,,train,test
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,inactive,0.0,6.3,True,,,,,test,train
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,inactive,0.0,6.3,False,,,,,train,test
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,inactive,0.0,6.3,True,,,,,test,train
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,inactive,0.0,6.3,True,,,,,test,train
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,weak inactive,0.0,6.3,True,,,,,train,test


In [78]:
# ==> 'random_splitter' => rmvStereo1
df = cura_df[:10]
mask_no_stereo= df['stereoSiblings'] == False
df_sub = df.loc[mask_no_stereo].reset_index(drop=True)
x_sub = df_sub['canonical_smiles_by_Std'].tolist()
y_sub = df_sub['lo_activity'].tolist()

train_folds1, test_folds1 = random_split(x_sub, y_sub, n_folds=2, random_seed=RANDOM_SEED)
df_rmvStereo1_sub = add_fold_columns(df_sub, prefix=f'int.rmvStereo1_rs_lo', train_folds=train_folds1, test_folds=test_folds1)
df_rmvStereo1_sub

Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo1_rs_lo_fold0,int.rmvStereo1_rs_lo_fold1
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,inactive,0.0,6.3,False,,,,,test,train
1,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,inactive,0.0,6.3,False,,,,,test,train
2,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,inactive,0.0,6.3,False,,,,,train,test


In [79]:
# ==> 'random_splitter' => merge results
df_merged = df_split.copy()
df_merged = df_merged.merge(
    df_rmvStereo1_sub[['activity_id'] + [col for col in df_rmvStereo1_sub.columns if col.__contains__("rmvStereo1_rs_lo")]], 
    on='activity_id',
    how='left',
)
df_merged

Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_rs_lo_fold0,int.rmvStereo0_rs_lo_fold1,int.rmvStereo1_rs_lo_fold0,int.rmvStereo1_rs_lo_fold1
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,6.3,False,,,,,train,test,test,train
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,6.3,True,,,,,test,train,,
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,6.3,True,,,,,train,test,,
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,6.3,False,,,,,test,train,test,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,6.3,True,,,,,train,test,,
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,6.3,True,,,,,test,train,,
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,6.3,False,,,,,train,test,train,test
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,6.3,True,,,,,test,train,,
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,6.3,True,,,,,test,train,,
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,6.3,True,,,,,train,test,,


In [83]:
# ==> updated `random_splitter` function
df_result = random_splitter(cura_df[:10], n_folds=2, aim='lo')
print(f'df_result.shape: {df_result.shape}')
print(f"df_result['activity_id']: \n{df_result['activity_id']}")
df_result

Random splitting completed.
df_result.shape: (10, 54)
df_result['activity_id']: 
0    32316
1    32321
2    34724
3    34729
4    35960
5    35965
6    37159
7    37164
8    38418
9    41949
Name: activity_id, dtype: int64


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_rs_lo_fold0,int.rmvStereo0_rs_lo_fold1,int.rmvStereo1_rs_lo_fold0,int.rmvStereo1_rs_lo_fold1
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,6.3,False,,,,,train,test,test,train
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,6.3,True,,,,,test,train,,
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,6.3,True,,,,,train,test,,
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,6.3,False,,,,,test,train,test,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,6.3,True,,,,,train,test,,
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,6.3,True,,,,,test,train,,
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,6.3,False,,,,,train,test,train,test
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,6.3,True,,,,,test,train,,
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,6.3,True,,,,,test,train,,
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,6.3,True,,,,,train,test,,


### aligned_split (train_folds, test_folds)

In [84]:
cf_df = df_result.copy()
col = 'int.rmvStereo1_rs_lo_fold0'
test_idx = cf_df.index[cf_df[col] == 'test'].tolist()
train_idx = cf_df.index[cf_df[col] == 'train'].tolist()

test_activity_ids = cf_df.loc[test_idx, 'activity_id'].tolist()
print(f'test_activity_ids: \n{test_activity_ids}')
train_activity_ids = cf_df.loc[train_idx, 'activity_id'].tolist()
print(f'train_activity_ids: \n{train_activity_ids}')

activity_list = [32316,
                 32321,
                 34724,
                 34729,
                 35960,
                 35965,
                 37159,
                 37164,
                 38418,
                 41949]

new_cols = {}
parent_col = 'int.rmvStereo1_rs_lo_fold0_check'
for id in activity_list:
    if id in test_activity_ids:
        new_cols.setdefault(parent_col, []).append('test')
    elif id in train_activity_ids:
        new_cols.setdefault(parent_col, []).append('train')
    else:
        new_cols.setdefault(parent_col, []).append(None)

new_cols

test_activity_ids: 
[32316, 34729]
train_activity_ids: 
[37159]


{'int.rmvStereo1_rs_lo_fold0_check': ['test',
  None,
  None,
  'test',
  None,
  None,
  'train',
  None,
  None,
  None]}

## random_split

In [26]:
# random split
test_folds = random_split(smiles, activity, n_folds=5, random_seed=RANDOM_SEED)
test_folds

[[2, 7, 9, 10, 14, 15, 30, 31, 33, 39, 40],
 [0, 5, 17, 23, 24, 25, 28, 35, 37, 45, 50],
 [4, 6, 12, 13, 26, 32, 34, 43, 44, 46, 52],
 [1, 11, 18, 20, 22, 29, 36, 42, 47, 49],
 [3, 8, 16, 19, 21, 27, 38, 41, 48, 51]]

In [62]:
for i in range(len(test_folds)):
    # add a new column f'rs_fold{i}' to df, the indices in test_folds[i] are marked as test, others are marked as train
    df[f'inner_rs_fold{i}'] = [0 if idx in test_folds[i] else 1 for idx in range(len(smiles))]

In [64]:
df[df['inner_rs_fold0']==0]

Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,assay,effect_description,assay_keywords_description,stereoSiblings,inner_chs_fold0,inner_rs_fold0,inner_rs_fold1,inner_rs_fold2,inner_rs_fold3,inner_rs_fold4
2,1951685,443966,CHEMBL892113,137,CHEMBL237,Ki,=,146.0,nM,6.84,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test,0,1,1,1,1
7,1951690,443966,CHEMBL892113,137,CHEMBL237,Ki,=,215.0,nM,6.67,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test,0,1,1,1,1
9,1951692,443966,CHEMBL892113,137,CHEMBL237,Ki,=,122.0,nM,6.91,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test,0,1,1,1,1
10,1951693,443966,CHEMBL892113,137,CHEMBL237,Ki,=,255.0,nM,6.59,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
14,1951697,443966,CHEMBL892113,137,CHEMBL237,Ki,=,771.0,nM,6.11,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
15,1951698,443966,CHEMBL892113,137,CHEMBL237,Ki,=,116.0,nM,6.94,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
30,1951715,443966,CHEMBL892113,137,CHEMBL237,Ki,=,1801.0,nM,5.74,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
31,1951716,443966,CHEMBL892113,137,CHEMBL237,Ki,=,1438.0,nM,5.84,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
33,1951718,443966,CHEMBL892113,137,CHEMBL237,Ki,=,9221.0,nM,5.04,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1
39,1951725,443966,CHEMBL892113,137,CHEMBL237,Ki,=,61045.0,nM,4.21,...,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,train,0,1,1,1,1


## cluster_aware_split (train_folds,test_folds)

In [17]:
# ==> 'cluster_aware_splitter' => rmvStereo0
df = cura_df[:20]
x_all = df['canonical_smiles_by_Std'].tolist()
selectionStrategy = 'clust_holdout'
sS = 'ch'

train_folds0, test_folds0 = cluster_aware_split(dist_type='substruct', selectionStrategy=selectionStrategy, x=x_all, 
                                              threshold=0.65, combineRandom=False, 
                                              random_seed=RANDOM_SEED, test_size=0.2, n_samples=2)
## check whether there are identical fold in 'test_folds'
tupled_test_folds = [tuple(sorted(fold)) for fold in test_folds0]
has_duplicates = len(tupled_test_folds) != len(set(tupled_test_folds))
print(f'has_duplicates in {sS}_test_folds: {has_duplicates}')

if has_duplicates:
    # get the unique folds
    test_folds0 = list(set(tupled_test_folds))
    print(f'Unique folds retained: {len(test_folds0)}')

# assign split to df
df_rmvStereo0 = add_fold_columns(df, prefix=f'int.rmvStereo0_{sS}', train_folds=train_folds0, test_folds=test_folds0)
print(f'df_rmvStereo0 shape: {df_rmvStereo0.shape}')
df_rmvStereo0

has_duplicates in ch_test_folds: True
Unique folds retained: 1
df_rmvStereo0 shape: (20, 51)


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,vs_threshold,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_ch_fold0
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,5.0,inactive,0.0,6.3,False,,,,,test
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,5.0,inactive,0.0,6.3,True,,,,,test
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,5.0,inactive,0.0,6.3,True,,,,,test
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,5.0,inactive,0.0,6.3,False,,,,,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,5.0,inactive,0.0,6.3,True,,,,,train
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,5.0,inactive,0.0,6.3,True,,,,,train
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,5.0,inactive,0.0,6.3,False,,,,,train
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,5.0,inactive,0.0,6.3,True,,,,,train
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,5.0,inactive,0.0,6.3,True,,,,,train
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,5.0,weak inactive,0.0,6.3,True,,,,,train


In [18]:
# ==> 'cluster_aware_splitter' => rmvStereo1
df = cura_df[:20]

mask_no_stereo= df['stereoSiblings'] == False
df_sub = df.loc[mask_no_stereo].reset_index(drop=True)
x_sub = df_sub['canonical_smiles_by_Std'].tolist()

train_folds1, test_folds1 = cluster_aware_split(dist_type= 'substruct', selectionStrategy=selectionStrategy, x=x_sub,
                                    threshold=0.65, combineRandom=False, random_seed=RANDOM_SEED, test_size=0.2, n_samples=2)

## check whether there are identical fold in 'test_folds'
tupled_test_folds = [tuple(sorted(fold)) for fold in test_folds1]
has_duplicates = len(tupled_test_folds) != len(set(tupled_test_folds))
print(f'has_duplicates in {sS}_test_folds: {has_duplicates}')

if has_duplicates:
    # get the unique folds
    test_folds1 = list(set(tupled_test_folds))
    print(f'Unique folds retained: {len(test_folds1)}')

# assign split to df
df_rmvStereo1 = add_fold_columns(df_sub, prefix=f'int.rmvStereo1_{sS}', train_folds=train_folds1, test_folds=test_folds1)
print(f'df_rmvStereo1 shape: {df_rmvStereo1.shape}')
df_rmvStereo1

has_duplicates in ch_test_folds: True
Unique folds retained: 1
df_rmvStereo1 shape: (6, 51)


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,vs_threshold,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo1_ch_fold0
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,5.0,inactive,0.0,6.3,False,,,,,train
1,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,5.0,inactive,0.0,6.3,False,,,,,train
2,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,5.0,inactive,0.0,6.3,False,,,,,train
3,47985,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2148.0,nM,5.67,...,5.0,inactive,0.0,6.3,False,,,,,train
4,47990,148100,CHEMBL751582,129,CHEMBL233,Ki,=,4705.0,nM,5.33,...,5.0,inactive,0.0,6.3,False,,,,,train
5,49384,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1967.0,nM,5.71,...,5.0,inactive,0.0,6.3,False,,,,,test


In [19]:
# ==> 'cluster_aware_splitter' => merge results
df_merged = df_rmvStereo0.copy()
df_merged = df_merged.merge(
    df_rmvStereo1[['activity_id'] + [col for col in df_rmvStereo1.columns if col.__contains__(f"rmvStereo1_ch")]], 
    on='activity_id',
    how='left',
)
print(f'df_merged shape: {df_merged.shape}')
df_merged

df_merged shape: (20, 52)


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_ch_fold0,int.rmvStereo1_ch_fold0
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,inactive,0.0,6.3,False,,,,,test,train
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,inactive,0.0,6.3,True,,,,,test,
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,inactive,0.0,6.3,True,,,,,test,
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,inactive,0.0,6.3,False,,,,,train,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,inactive,0.0,6.3,True,,,,,train,
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,inactive,0.0,6.3,True,,,,,train,
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,inactive,0.0,6.3,False,,,,,train,train
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,inactive,0.0,6.3,True,,,,,train,
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,inactive,0.0,6.3,True,,,,,train,
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,weak inactive,0.0,6.3,True,,,,,train,


In [20]:
# ==> updated 'cluster_aware_splitter' function
df_result = cluster_aware_splitter(cura_df[:20], selectionStrategy='clust_holdout')
print(f'df_result shape: {df_result.shape}')
df_result

int.rmvStereo0_ch: duplicate test folds detected. Keeping unique ones.
int.rmvStereo1_ch: duplicate test folds detected. Keeping unique ones.
Cluster-aware split completed for strategy 'clust_holdout'.
df_result shape: (20, 52)


Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,lo_activity_comment,lo_activity,lo_threshold,stereoSiblings,effect,assay,effect_description,assay_keywords_description,int.rmvStereo0_ch_fold0,int.rmvStereo1_ch_fold0
0,32316,148100,CHEMBL751582,129,CHEMBL233,Ki,=,10620.0,nM,4.97,...,inactive,0.0,6.3,False,,,,,test,train
1,32321,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1591.0,nM,5.8,...,inactive,0.0,6.3,True,,,,,test,
2,34724,148100,CHEMBL751582,129,CHEMBL233,Ki,=,801.0,nM,6.1,...,inactive,0.0,6.3,True,,,,,test,
3,34729,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1599.0,nM,5.8,...,inactive,0.0,6.3,False,,,,,train,train
4,35960,148100,CHEMBL751582,129,CHEMBL233,Ki,=,865.0,nM,6.06,...,inactive,0.0,6.3,True,,,,,train,
5,35965,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2176.0,nM,5.66,...,inactive,0.0,6.3,True,,,,,train,
6,37159,148100,CHEMBL751582,129,CHEMBL233,Ki,=,2248.0,nM,5.65,...,inactive,0.0,6.3,False,,,,,train,train
7,37164,148100,CHEMBL751582,129,CHEMBL233,Ki,=,1815.0,nM,5.74,...,inactive,0.0,6.3,True,,,,,train,
8,38418,148100,CHEMBL751582,129,CHEMBL233,Ki,=,7159.0,nM,5.14,...,inactive,0.0,6.3,True,,,,,train,
9,41949,148100,CHEMBL751582,129,CHEMBL233,Ki,=,586.0,nM,6.23,...,weak inactive,0.0,6.3,True,,,,,train,


## cluster_stratified_split

In [34]:
#cluster_aware_split: clust_stratified
test_folds_2 = cluster_aware_split(dist_type= 'substruct', selectionStrategy='clust_stratified', x=smiles,
                                   threshold=0.65, combineRandom=False, random_seed=RANDOM_SEED, test_size=0.2, n_samples=5)
## check whether there are identical elements in 'test_folds_3'
tupled_folds_2 = [tuple(sorted(fold)) for fold in test_folds_2]
has_duplicates = len(tupled_folds_2) != len(set(tupled_folds_2))
print(f'has_duplicates in test_folds_2: {has_duplicates}')
test_folds_2

has_duplicates in test_folds_2: False


[[8, 22, 24, 2, 20, 37, 15, 39, 18, 10, 49],
 [3, 10, 22, 16, 11, 44, 46, 28, 8, 34, 2],
 [15, 28, 24, 48, 39, 25, 46, 13, 23, 45, 2],
 [40, 22, 25, 4, 19, 11, 44, 50, 35, 21, 17],
 [29, 6, 8, 13, 15, 21, 23, 27, 24, 32, 7]]

In [36]:
for i in range(len(test_folds_2)):
    # add a new column f'inner_css_fold{i}' to df, the indices in test_folds[i] are marked as test, others are marked as train
    df[f'inner_css_fold{i}'] = ['test' if idx in test_folds_2[i] else 'train' for idx in range(len(smiles))]

In [38]:
df[df['inner_css_fold0']=='test']

Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,inner_rs_fold0,inner_rs_fold1,inner_rs_fold2,inner_rs_fold3,inner_rs_fold4,inner_css_fold0,inner_css_fold1,inner_css_fold2,inner_css_fold3,inner_css_fold4
2,1951685,443966,CHEMBL892113,137,CHEMBL237,Ki,=,146.0,nM,6.84,...,test,train,train,train,train,test,test,test,train,train
8,1951691,443966,CHEMBL892113,137,CHEMBL237,Ki,=,174.0,nM,6.76,...,train,train,train,train,test,test,test,train,train,test
10,1951693,443966,CHEMBL892113,137,CHEMBL237,Ki,=,255.0,nM,6.59,...,test,train,train,train,train,test,test,train,train,train
15,1951698,443966,CHEMBL892113,137,CHEMBL237,Ki,=,116.0,nM,6.94,...,test,train,train,train,train,test,train,test,train,test
18,1951701,443966,CHEMBL892113,137,CHEMBL237,Ki,=,1056.0,nM,5.98,...,train,train,train,test,train,test,train,train,train,train
20,1951703,443966,CHEMBL892113,137,CHEMBL237,Ki,=,279.0,nM,6.55,...,train,train,train,test,train,test,train,train,train,train
22,1951705,443966,CHEMBL892113,137,CHEMBL237,Ki,=,725.0,nM,6.14,...,train,train,train,test,train,test,test,train,test,train
24,1951707,443966,CHEMBL892113,137,CHEMBL237,Ki,=,30.0,nM,7.52,...,train,test,train,train,train,test,train,test,train,test
37,1951722,443966,CHEMBL892113,137,CHEMBL237,Ki,=,26405.0,nM,4.58,...,train,test,train,train,train,test,train,train,train,train
39,1951725,443966,CHEMBL892113,137,CHEMBL237,Ki,=,61045.0,nM,4.21,...,test,train,train,train,train,test,train,test,train,train


## cluster_holdout_split

In [54]:
#cluster_aware_split: clust_holdout
test_folds_3 = cluster_aware_split(dist_type= 'substruct', selectionStrategy='clust_holdout', x=smiles,
                                   threshold=0.65, combineRandom=False, random_seed=RANDOM_SEED, test_size=0.2, n_samples=5)
## check whether there are identical elements in 'test_folds_3'
tupled_folds_3 = [tuple(sorted(fold)) for fold in test_folds_3]
has_duplicates = len(tupled_folds_3) != len(set(tupled_folds_3))
print(f'has_duplicates in test_folds_3: {has_duplicates}')
test_folds_3

has_duplicates in test_folds_3: True


[[38, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [38, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [38, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [38, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [38, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]

In [None]:
has_duplicates = len(tupled_folds_3) != len(set(tupled_folds_3))
print(f'test_folds_3 has duplicates: {has_duplicates}')
if has_duplicates:
    # get the unique folds
    unique_folds = list(set(tupled_folds_3))
    print(f'Number of unique folds in test_folds_3: {len(unique_folds)}')

unique_folds
for i in range(len(unique_folds)):
    # add a new column f'inner_chs_fold{i}' to df, the indices in unique_folds[i] are marked as test, others are marked as train
    df[f'inner_chs_fold{i}'] = ['test' if idx in unique_folds[i] else 'train' for idx in range(len(smiles))]

test_folds_3 has duplicates: True
Number of unique folds in test_folds_3: 1


In [61]:
df[df['inner_chs_fold0']=='test']

Unnamed: 0,activity_id,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,vs_threshold,lo_activity_comment,lo_activity,lo_threshold,effect,assay,effect_description,assay_keywords_description,stereoSiblings,inner_chs_fold0
0,1951530,443966,CHEMBL892113,137,CHEMBL237,Ki,=,364.0,nM,6.44,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
1,1951684,443966,CHEMBL892113,137,CHEMBL237,Ki,=,186.0,nM,6.73,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
2,1951685,443966,CHEMBL892113,137,CHEMBL237,Ki,=,146.0,nM,6.84,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
3,1951686,443966,CHEMBL892113,137,CHEMBL237,Ki,=,139.0,nM,6.86,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
4,1951687,443966,CHEMBL892113,137,CHEMBL237,Ki,=,261.0,nM,6.58,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
5,1951688,443966,CHEMBL892113,137,CHEMBL237,Ki,=,323.0,nM,6.49,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
6,1951689,443966,CHEMBL892113,137,CHEMBL237,Ki,=,330.0,nM,6.48,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
7,1951690,443966,CHEMBL892113,137,CHEMBL237,Ki,=,215.0,nM,6.67,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
8,1951691,443966,CHEMBL892113,137,CHEMBL237,Ki,=,174.0,nM,6.76,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test
9,1951692,443966,CHEMBL892113,137,CHEMBL237,Ki,=,122.0,nM,6.91,...,5.9694,active,1.0,5.9694,bind,RBA,binding affinity,Receptor binding assay: radioligand binding assay,False,test


## Utility functions for splitting

In [None]:
def read_df(in_path: str, file: str, aim: str = 'lo', remove_stereo: bool = True):
    """
    """
    df = pd.read_csv(os.path.join(in_path, file))

    smiles = df['canonical_smiles_by_Std'].tolist()
    if aim == 'vs':
        activity = df['vs_activity'].tolist()
    elif aim == 'lo':
        activity = df['lo_activity'].tolist()
    activity_id = df['activity_id'].tolist()

    if remove_stereo:
        stereo_smiles = find_stereochemical_siblings(smiles)
        stereo_smiles_idx = [i for i, smi in enumerate(smiles) if smi in stereo_smiles]

        if len(stereo_smiles_idx) == 0:
            print('No stereoisomers found')
        else:
            print(f'Removed {len(stereo_smiles_idx)} stereoisomers, and the idx are {stereo_smiles_idx}')
            smiles = [smi for i, smi in enumerate(smiles) if i not in stereo_smiles_idx]
            activity = [act for i, act in enumerate(activity) if i not in stereo_smiles_idx]
            activity_id = [act_id for i, act_id in enumerate(activity_id) if i not in stereo_smiles_idx]
            
            df = df.drop(index=stereo_smiles_idx).reset_index(drop=True)

    return df, smiles, activity, activity_id

def assign_split(n_folds: int = 5):
    
    sub_sups_dict = {
        CURA_LHD_OR_DIR: (CURA_MHD_OR_DIR,CURA_MHD_OR_effect_DIR, CURA_HHD_OR_DIR),
        #CURA_MHD_OR_DIR: (CURA_MHD_OR_effect_DIR, CURA_HHD_OR_DIR),
    }

    cura_split_dict = {
        CURA_LHD_OR_DIR: SPLIT_LHD_OR_DIR,
        CURA_MHD_OR_DIR: SPLIT_MHD_OR_DIR,
        CURA_MHD_OR_effect_DIR: SPLIT_MHD_OR_effect_DIR,
        CURA_HHD_OR_DIR: SPLIT_HHD_OR_DIR,
    }

    for sub, sups in sub_sups_dict.items():
        #print(f'Subset: {sub}')
        for sub_f in os.listdir(sub):
            print(f'sub_f is {sub_f}')
            if sub == CURA_LHD_OR_DIR:
                target, assay, effect, standard_type, assay_chembl_id = sub_f.split('_')[:5]
            elif sub == CURA_MHD_OR_DIR:
                target, assay, effect, standard_type = sub_f.split('_')[:4]

            sub_df, smiles, activity, activity_id = read_df(sub, sub_f, aim='lo', remove_stereo=True)
            fold_train_act_id, fold_test_act_id = random_split(smiles, activity, activity_id, n_folds=5, test_size=0.2)

            for sup in sups:
                if sup == CURA_MHD_OR_DIR:
                    sup_base_name = f'{target}_{assay}_{effect}_{standard_type}'
                elif sup == CURA_MHD_OR_effect_DIR:
                    sup_base_name = f'{target}_{effect}'
                elif sup == CURA_HHD_OR_DIR:
                    sup_base_name = f'{target}_{standard_type}'
                
                sup_fs = [f for f in os.listdir(sup) if f.startswith(sup_base_name)]
                print (f'sup_fs found: {sup_fs}')
                if len(sup_fs) == 1:
                    sup_f = sup_fs[0]
                elif len(sup_fs) == 0:
                    print(f'No sup_f found for {sup_base_name} in {sup}')
                    continue
                else:
                    print(f'More than one sup_f found for {sup_base_name} in {sup}, please check!')
                    continue
                print(f'sup_f found: {sup_f}')
                sup_df, _, _, _ = read_df(sup, sup_f, aim='lo', remove_stereo=True)

                for i in range(n_folds):
                    sub_df[f'fold{i}'] = ['test' if act_id in fold_test_act_id[f'fold_{i}'] else 'train' for act_id in sub_df['activity_id']]
                    sup_df[f'{sub_f}:fold{i}'] = ['test' if act_id in fold_test_act_id[f'fold_{i}'] else 'train' for act_id in sup_df['activity_id']]

                mkdirs(cura_split_dict[sub])
                mkdirs(cura_split_dict[sup])
                sub_df.to_csv(os.path.join(cura_split_dict[sub], sub_f), index=False)
                sup_df.to_csv(os.path.join(cura_split_dict[sup], sup_f), index=False)

            print('----------------------------------')
assign_split()

sub_f is CHEMBL233_bind_RBA_Ki_CHEMBL3887789_lhd_b50_curated.csv


100%|██████████| 90/90 [00:00<00:00, 11648.69it/s]

Removed 2 stereoisomers, and the idx are [32, 33]
sup_fs found: ['CHEMBL233_bind_RBA_Ki_mhd_b50_curated.csv']
sup_f found: CHEMBL233_bind_RBA_Ki_mhd_b50_curated.csv



100%|██████████| 4472/4472 [00:19<00:00, 233.32it/s] 


Removed 633 stereoisomers, and the idx are [1, 3, 6, 7, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 25, 31, 33, 34, 35, 37, 39, 40, 43, 45, 46, 51, 53, 59, 66, 76, 79, 80, 87, 88, 92, 97, 99, 100, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 186, 188, 192, 194, 196, 202, 204, 207, 212, 214, 216, 218, 223, 227, 228, 229, 232, 233, 239, 240, 241, 244, 257, 268, 269, 278, 283, 291, 293, 348, 351, 387, 388, 389, 391, 394, 397, 398, 399, 401, 405, 412, 414, 416, 434, 438, 441, 445, 452, 457, 459, 462, 465, 470, 475, 479, 481, 482, 483, 493, 498, 501, 522, 523, 524, 526, 527, 528, 529, 530, 531, 532, 533, 534, 536, 537, 538, 539, 540, 541, 542, 545, 546, 547, 548, 549, 612, 645, 647, 655, 656, 668, 703, 731, 739, 740, 744, 745, 746, 814, 847, 850, 852, 853, 854, 885, 886, 889, 890, 891, 927, 928, 929, 930, 931, 932, 933, 934, 935, 952, 1019, 1020, 1021, 1022, 1023, 1024, 1028, 1029, 1030, 1031, 1033, 1035, 1037, 1038, 1039, 1040, 1

100%|██████████| 4535/4535 [00:19<00:00, 232.91it/s]


Removed 647 stereoisomers, and the idx are [1, 3, 6, 7, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 25, 31, 33, 34, 35, 37, 39, 40, 43, 45, 46, 51, 53, 59, 66, 76, 79, 80, 87, 88, 92, 97, 99, 100, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 178, 187, 189, 193, 195, 197, 203, 205, 208, 213, 215, 217, 219, 224, 228, 229, 230, 233, 234, 240, 241, 242, 245, 259, 270, 271, 280, 285, 293, 295, 359, 362, 424, 425, 426, 428, 431, 434, 435, 436, 438, 442, 449, 451, 453, 471, 475, 478, 482, 489, 496, 498, 502, 507, 515, 523, 527, 529, 530, 531, 541, 546, 549, 570, 571, 572, 574, 575, 576, 577, 578, 579, 580, 581, 582, 584, 585, 586, 587, 589, 590, 591, 594, 595, 596, 597, 598, 661, 692, 694, 702, 703, 715, 744, 745, 751, 752, 753, 754, 755, 756, 767, 795, 803, 804, 808, 809, 810, 878, 911, 914, 916, 917, 918, 949, 950, 953, 954, 955, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1016, 1083, 1084, 1085, 1086, 1087, 1088, 1092, 1093, 109

100%|██████████| 68/68 [00:00<00:00, 15592.21it/s]

No stereoisomers found





ValueError: The target variable 'activity' must contain at least two classes for stratified splitting.

## neighbor_split

In [None]:
# def clusterData
dmat = get_substructure_matrix(smiles)
dmat

100%|██████████| 53/53 [00:00<00:00, 19428.26it/s]


array([[0.        , 0.30645161, 0.31666667, ..., 0.79411765, 0.75      ,
        0.64864865],
       [0.30645161, 0.        , 0.8245614 , ..., 0.28358209, 0.29411765,
        0.34375   ],
       [0.31666667, 0.8245614 , 0.        , ..., 0.29230769, 0.3030303 ,
        0.35483871],
       ...,
       [0.79411765, 0.28358209, 0.29230769, ..., 0.        , 0.74358974,
        0.65      ],
       [0.75      , 0.29411765, 0.3030303 , ..., 0.74358974, 0.        ,
        0.61904762],
       [0.64864865, 0.34375   , 0.35483871, ..., 0.65      , 0.61904762,
        0.        ]])

In [None]:
threshold = 0.3
#clusterSizeThreshold = max(5, len(dmat)/50)
clusterSizeThreshold = 2
print(f'Cluster size threshold is set to {clusterSizeThreshold}')
combineRandom=False

nfps = len(smiles)
print(f'Number of fingerprints: {nfps}')
symmDmat = []
for i in range(1, nfps):
    symmDmat.extend(dmat[i, :i]) # the list of values in the lower triangle of the distance matrix, excluding the diagonal.
print(f'The length of symmDmat is {len(symmDmat)}')
symmDmat

Cluster size threshold is set to 2
Number of fingerprints: 53
The length of symmDmat is 1378


[0.3064516129032258,
 0.31666666666666665,
 0.8245614035087719,
 0.34545454545454546,
 0.7368421052631579,
 0.7636363636363637,
 0.34545454545454546,
 0.6779661016949152,
 0.7017543859649122,
 0.7692307692307693,
 0.34545454545454546,
 0.6779661016949152,
 0.7017543859649122,
 0.7692307692307693,
 0.7692307692307693,
 0.3275862068965517,
 0.59375,
 0.639344262295082,
 0.6379310344827587,
 0.6101694915254238,
 0.6101694915254238,
 0.3333333333333333,
 0.6031746031746031,
 0.6229508196721312,
 0.6491228070175439,
 0.6206896551724138,
 0.6206896551724138,
 0.8301886792452831,
 0.3220338983050847,
 0.6885245901639344,
 0.6557377049180327,
 0.6271186440677966,
 0.6,
 0.6,
 0.6779661016949152,
 0.6896551724137931,
 0.31666666666666665,
 0.6507936507936508,
 0.7,
 0.6440677966101694,
 0.5901639344262295,
 0.5901639344262295,
 0.6949152542372882,
 0.6779661016949152,
 0.7719298245614035,
 0.36538461538461536,
 0.6271186440677966,
 0.6491228070175439,
 0.7115384615384616,
 0.6792452830188679,
 

In [None]:
cs = Butina.ClusterData(symmDmat, nfps, threshold, isDistData=True, reordering=True)
cs

((11,
  0,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  39,
  40,
  41,
  43,
  44,
  45,
  46,
  48,
  49,
  50,
  51,
  52),
 (42, 12),
 (47,),
 (38,),
 (25,),
 (24,),
 (23,),
 (22,),
 (21,),
 (20,),
 (19,),
 (18,),
 (17,),
 (16,),
 (15,),
 (14,),
 (13,),
 (10,),
 (9,),
 (8,),
 (7,),
 (6,),
 (5,),
 (4,),
 (3,),
 (2,),
 (1,))

In [None]:
cs = sorted(cs, key=lambda x: len(x), reverse=True)
cs

[(11,
  0,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  39,
  40,
  41,
  43,
  44,
  45,
  46,
  48,
  49,
  50,
  51,
  52),
 (42, 12),
 (47,),
 (38,),
 (25,),
 (24,),
 (23,),
 (22,),
 (21,),
 (20,),
 (19,),
 (18,),
 (17,),
 (16,),
 (15,),
 (14,),
 (13,),
 (10,),
 (9,),
 (8,),
 (7,),
 (6,),
 (5,),
 (4,),
 (3,),
 (2,),
 (1,)]

In [None]:
# start with the large clusters
largeClusters = [list(c) for c in cs if len(c) >= clusterSizeThreshold]
largeClusters

[[11,
  0,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  39,
  40,
  41,
  43,
  44,
  45,
  46,
  48,
  49,
  50,
  51,
  52],
 [42, 12]]

In [None]:
if not largeClusters:
    raise ValueError("no clusters found")
if combineRandom:
    tmpCluster = []
    for c in cs:
        if len(c) >= clusterSizeThreshold:
            continue
        tmpCluster.extend(c)
        if len(tmpCluster) >= clusterSizeThreshold:
            random.shuffle(tmpCluster)
            largeClusters.append(tmpCluster)
            tmpCluster = []
    if tmpCluster:
        largeClusters.append(tmpCluster)
else:
    # add points from small cluster to the nearest large cluster
    # nearest is defined by the nearest neighbor in that cluster
    print(f'cs is {cs}')
    for c in cs:
        print(f'c: {c}')
        if len(c) >= clusterSizeThreshold:
            continue
        for idx in c:
            print(f'idx: {idx}')
            closest = -1
            minD = 1e5
            print(f'minD is {minD}')
            for cidx, clust in enumerate(largeClusters):
                print(f'cidx: {cidx}, clust: {clust}')
                for elem in clust:
                    print(f'idx, elem: {idx}, {elem}')
                    d = dmat[idx, elem]
                    print(f'd: {d}')
                    if d < minD:
                        closest = cidx
                        minD = d
            assert closest > -1
            print(f'closest is {closest}')
            print(f'minD is {minD}')
            largeClusters[closest].append(idx)
            print(f'largeClusters is now {largeClusters}')
            print('=====================')

print(f'largeClusters: {largeClusters}')

cs is [(11, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 49, 50, 51, 52), (42, 12), (47,), (38,), (25,), (24,), (23,), (22,), (21,), (20,), (19,), (18,), (17,), (16,), (15,), (14,), (13,), (10,), (9,), (8,), (7,), (6,), (5,), (4,), (3,), (2,), (1,)]
c: (11, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 49, 50, 51, 52)
c: (42, 12)
c: (47,)
idx: 47
minD is 100000.0
cidx: 0, clust: [11, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 43, 44, 45, 46, 48, 49, 50, 51, 52]
idx, elem: 47, 11
d: 0.32142857142857145
idx, elem: 47, 0
d: 0.6486486486486487
idx, elem: 47, 26
d: 0.5714285714285714
idx, elem: 47, 27
d: 0.627906976744186
idx, elem: 47, 28
d: 0.5714285714285714
idx, elem: 47, 29
d: 0.5581395348837209
idx, elem: 47, 30
d: 0.4782608695652174
idx, elem: 47, 31
d: 0.5238095238095238
idx, elem: 47, 32
d: 0.5
idx, elem: 47, 33
d: 0.5
idx, elem: 47, 34
d: 0.5897435897435898
idx, elem: 47, 35
d: 0.5227272

In [None]:
# def assignUsingClusters
nSamples=4

class SelectionStrategy:
    DIVERSE_SPLIT = 1
    CLUSTERS_SPLIT = 2
selectionStrategy = SelectionStrategy.CLUSTERS_SPLIT
print(f'Selection strategy is {selectionStrategy}')
test_size = 0.2
nTest= round(len(smiles)*test_size)
print(f'Number of test samples: {nTest}')

random.seed(RANDOM_SEED)
res = []
for i in range(nSamples):
    print(f'Iteration {i}')
    if selectionStrategy == SelectionStrategy.DIVERSE_SPLIT: 
    # The test set has the similar distribution of clusters as the training set
        ordered = []
        for c in largeClusters:
            print(f'c before shuffle: {c}')
            random.shuffle(c)
            print(f'c after shuffle: {c}')
            ordered.extend((i / len(c), x) for i, x in enumerate(c))
            print(f'ordered in: {ordered}')
            print('-------------------')

        print(f'ordered before sort: {ordered}')
        ordered = [y for x, y in sorted(ordered)]
        print(f'ordered after sort: {ordered}')
        test=ordered[:nTest]
        print(f'test: {test}')
    elif selectionStrategy == SelectionStrategy.CLUSTERS_SPLIT:
        # The test set has the different distribution of clusters as the training set
        random.shuffle(largeClusters)
        print(f'largeClusters after shuffle: {largeClusters}')
        test = []
        for c in largeClusters:
            nRequired = nTest - len(test)
            print(f'nRequired: {nRequired}, len(test): {len(test)}')
            test.extend(c[:nRequired])
            if len(test) >= nTest:
                break
        print(f'test: {test}')
    
    res.append(test)
    print(f'res: {res}')
    print('----------------------------------')

print(f'Final result: {res}')

Selection strategy is 2
Number of test samples: 11
Iteration 0
largeClusters after shuffle: [[18, 21, 51, 29, 28, 32, 13, 0, 19, 50, 31, 5, 36, 10, 22, 8, 2, 48, 14, 9, 24, 40, 45, 16, 44, 37, 15, 38, 26, 6, 27, 33, 20, 11, 1, 25, 41, 43, 30, 4, 7, 17, 39, 49, 52, 23, 35, 34, 3, 46], [42, 12, 47]]
nRequired: 11, len(test): 0
test: [18, 21, 51, 29, 28, 32, 13, 0, 19, 50, 31]
res: [[18, 21, 51, 29, 28, 32, 13, 0, 19, 50, 31]]
----------------------------------
Iteration 1
largeClusters after shuffle: [[42, 12, 47], [18, 21, 51, 29, 28, 32, 13, 0, 19, 50, 31, 5, 36, 10, 22, 8, 2, 48, 14, 9, 24, 40, 45, 16, 44, 37, 15, 38, 26, 6, 27, 33, 20, 11, 1, 25, 41, 43, 30, 4, 7, 17, 39, 49, 52, 23, 35, 34, 3, 46]]
nRequired: 11, len(test): 0
nRequired: 8, len(test): 3
test: [42, 12, 47, 18, 21, 51, 29, 28, 32, 13, 0]
res: [[18, 21, 51, 29, 28, 32, 13, 0, 19, 50, 31], [42, 12, 47, 18, 21, 51, 29, 28, 32, 13, 0]]
----------------------------------
Iteration 2
largeClusters after shuffle: [[42, 12, 47

In [None]:
# Random split
# neighbor split: train_test_diff
#                 train_test_similar
# inner_split: lhd_or, mhd_or, mhd_or_effect
# outer_split: 
#               based on lhd_or: mhd_or, mhd_or_effect, hhd_or
#               based on mhd_or: mhd_or_effect, hhd_or
#               based on mhd_or_effect: hhd_or

# cross-validation for hyperparameter tuning
# nested cross-validation for model evaluation


## K-fold CV for cluster-aware splitting

In [21]:
largeClusters=[[11,0,26,27,28,29,30,31,32,33,34,35,36,37,39,40,41,43,44,45,46,48,49,50,51,52],[42, 12]]
random_seed = 42
n_folds =5
selectionStrategy = 'clust_stratified'
selectionStrategy = 'clust_holdout'

x = len(largeClusters[0]) + len(largeClusters[1])
n_mols = x
print(f'Number of molecules: {n_mols}')
test_folds = []
train_folds = []

if selectionStrategy == 'clust_stratified':
    # Each cluster contributes some members to each fold
    fold_assignments = [[] for _ in range(n_folds)]

    for cluster in largeClusters:
        random.shuffle(cluster)
        # Split cluster into roughly equal parts across folds
        splits = np.array_split(cluster, n_folds)
        for i in range(n_folds):
            fold_assignments[i].extend(splits[i])

    # Construct train/test splits
    for i in range(n_folds):
        test_idx = fold_assignments[i]
        train_idx = [idx for j, f in enumerate(fold_assignments) if j != i for idx in f]
        test_folds.append(np.array(test_idx))
        train_folds.append(np.array(train_idx))

elif selectionStrategy == 'clust_holdout':
    n_clusters = len(largeClusters)
    if n_clusters < n_folds:
        # Adjust n_folds down (Safest)
        print(f"Warning: Reducing n_folds from {n_folds} to {n_clusters} due to low cluster count.")
        n_folds = n_clusters

    # Shuffle clusters (use local Random for reproducibility without changing global RNG)
    rng = random.Random(random_seed)
    rng.shuffle(largeClusters)

    # Greedy assignment: assign each cluster to the fold with the smallest current size
    fold_assignments = [[] for _ in range(n_folds)]
    fold_sizes = [0] * n_folds  # number of molecules in each fold

    for cluster in largeClusters:
        # choose fold with minimum size (tie-breaker: lowest index)
        target_fold = int(np.argmin(fold_sizes))
        fold_assignments[target_fold].extend(cluster)
        fold_sizes[target_fold] += len(cluster)
        
    # Construct train/test splits (ensure integer dtype)
    for i in range(n_folds):
        test_idx = np.array(fold_assignments[i], dtype=int)
        train_idx = np.array([idx for j, f in enumerate(fold_assignments) if j != i for idx in f], dtype=int)
        test_folds.append(test_idx)
        train_folds.append(train_idx)


Number of molecules: 28


In [22]:
test_folds

[array([42, 12]),
 array([11,  0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41,
        43, 44, 45, 46, 48, 49, 50, 51, 52])]

In [23]:
train_folds

[array([11,  0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41,
        43, 44, 45, 46, 48, 49, 50, 51, 52]),
 array([42, 12])]

In [34]:
fold_sizes = [[25, 2]]
fold_argmin = np.argmin(fold_sizes)
fold_argmin
    

1

# assay_wise_split on MHDs and HHDs