In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from extendtr.TR.topoReg import TopoReg
from extendtr.utils.args import TopoRegArgs
from extendtr.utils.utils import set_seed, metric_calc

## Prepare the data

In [2]:
# load the descriptor - the indices of desc will be used later
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_ECFP4.parquet', engine='fastparquet').astype('bool')
# load targets
data = pd.read_csv(f'./SampleDatasets/CHEMBL278/data_cp.csv', index_col=0)
target = data["pChEMBL Value"]
# make sure that the indices of desc and target match
desc = desc.loc[target.index]
target = target.loc[desc.index]

# load indices for scaffold split
with open(f'./SampleDatasets/CHEMBL278/scaffold_split_index.json', 'r') as f:
    index = json.load(f)  
train_idx = index['train_idx']
test_idx = index['test_idx']
# make sure that train and test indices are included in target.index
train_idx = [idx for idx in train_idx if idx in target.index]
test_idx = [idx for idx in test_idx if idx in target.index]

##### alternatively, you can randomly split train and test idx
# dataset_idx = target.index.tolist()
# train_idx, test_idx = train_test_split(dataset_idx, test_size=0.2, random_state=args.seed)

# set validation index if necessary
val_set = 0.2 # a fraction number to use [val_set] percent samples from the train set as val set, or None for no validation
if val_set is not None: # if we want to test on the validation set
    train_idx, val_idx = train_test_split(train_idx, test_size=val_set, random_state=2021)
else: # no validation
    val_idx = None

## Run ensemble TR with ECFP4

In [3]:
# get the args
args = TopoRegArgs('-ensemble 1') # ensemble TR
# set random seed
set_seed(args.seed)

# train and get the predictions
mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)

# evaluate the resuls
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Spearman: 0.8288945395492194
R2: 0.8700309651617721
RMSE: 0.6685660816936404
NRMSE: 0.36051218403575186


## Run ensemble TR with ANN

In [3]:
# get the args
args = TopoRegArgs('-ensemble 1 -distance jaccard -model ANN -ann_cp_dir ./results/ann_cp/') # ensemble TR, Jaccard distance, ANN model, 0.2 validation set, dir to save the checkpoints
# set random seed
set_seed(args.seed)
# train and get the predictions
mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)
# evaluate the resuls
args.verbose = 1 # to report the metrics
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Spearman: 0.7776435139782599
R2: 0.5956033004081223
RMSE: 1.179310578285253
NRMSE: 0.6359219288496644


## Run Mordred descriptor with Euclidean distance

In [5]:
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_Mordred.parquet', engine='fastparquet')
desc = desc.loc[target.index]
# get the args
args = TopoRegArgs('-ensemble 1 -distance euclidean -desc_norm 1') # ensemble TR, euclidean distance, normalize the descriptors before calculate the distances
# set random seed
set_seed(args.seed)

# train and get the predictions
mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)

# evaluate the resuls
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Spearman: 0.8056903350584307
R2: 0.8766784401382622
RMSE: 0.6512442435883308
NRMSE: 0.35117169570131607


## Combine predictions from various configurations

### prepare data

In [6]:
# load the descriptor - the indices of desc will be used later
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_ECFP4.parquet', engine='fastparquet').astype('bool')
# load targets
data = pd.read_csv(f'./SampleDatasets/CHEMBL278/data_cp.csv', index_col=0)
target = data["pChEMBL Value"]
# make sure that the indices of desc and target match
desc = desc.loc[target.index]
target = target.loc[desc.index]

# load indices for scaffold split
with open(f'./SampleDatasets/CHEMBL278/scaffold_split_index.json', 'r') as f:
    index = json.load(f)  
train_idx = index['train_idx']
test_idx = index['test_idx']
# make sure that train and test indices are included in target.index
train_idx = [idx for idx in train_idx if idx in target.index]
test_idx = [idx for idx in test_idx if idx in target.index]

##### alternatively, you can randomly split train and test idx
# dataset_idx = target.index.tolist()
# train_idx, test_idx = train_test_split(dataset_idx, test_size=0.2, random_state=args.seed)

# set validation index if necessary
val_set = 0.2 # a fraction number to use [val_set] percent samples from the train set as val set, or None for no validation
if val_set is not None: # if we want to test on the validation set
    train_idx, val_idx = train_test_split(train_idx, test_size=val_set, random_state=2021)
else: # no validation
    val_idx = None

### Different descriptors

In [8]:
from extendtr.utils.utils import stack_models
# define arg strs for different anchor selection methods
args = TopoRegArgs(f'-ensemble 1')
descriptors = ['ECFP4', 'ECFP6', 'Mordred', 'RDKdesc', 'tcnn_zpad']
data_path = './SampleDatasets/CHEMBL278/'

########################### Train and get the prediction ###############################
preds_test = []
preds_val = []
for descriptor in descriptors:
    # Load descriptors
    if descriptor in ['ECFP4', 'ECFP6']:
        desc = pd.read_parquet(f'{data_path}/data_{descriptor}.parquet', engine='fastparquet').astype('bool')
    elif descriptor in ['Mordred', 'RDKdesc']:
        desc = pd.read_parquet(f'{data_path}/data_{descriptor}.parquet', engine='fastparquet')
    elif descriptor == 'tcnn_zpad':
        desc = pd.read_csv(f"{data_path}/tcnn_embeddings_zpad.csv", index_col=0)
    
    if descriptor in ['ECFP4', 'ECFP6']:
        args.distance = 'jaccard'
        args.desc_norm = False
    else:
        args.distance = 'euclidean'
        args.desc_norm = True

    # set random seed
    set_seed(args.seed)
    mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)
    # Stack the predicted responses for test and validation sets
    preds_test.append(pred_test)
    preds_val.append(pred_val)

########################### Combine the results ###############################
# ensemble
pred_test = np.array(preds_test).mean(axis=0)
# evaluation
print('Performance of ensemble predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

# stacking the results
pred_test, train_time, test_time = stack_models(preds_val, preds_test, target, val_idx)
# evaluation
print('Performance of stacking predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Performance of ensemble predictions:
Spearman: 0.8079098676618973
R2: 0.8286059578308224
RMSE: 0.7677542929995983
NRMSE: 0.4139976354632688
Performance of stacking predictions:
Spearman: 0.7691689349468414
R2: 0.8473305802385793
RMSE: 0.7246034882029492
NRMSE: 0.39072934335857173


### Different anchor selection methods

In [9]:
from extendtr.utils.utils import stack_models
# load the descriptor - the indices of desc will be used later
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_ECFP4.parquet', engine='fastparquet').astype('bool')
# define arg strs for different anchor selection methods
arg_strs_anchor = [f'-ensemble 1', 
                f'-anchorselection maximin -ensemble 1 -mean_anchor_percentage 0.4 -min_anchor_percentage 0.2 -max_anchor_percentage 0.6',
                f'-refine_anchors_lasso 1 -anchor_percentage 0.8', 
                f'-anchorselection maximin_density -weight_density 0.5 -check_duplicates 1'
                ]


########################### Train and get the prediction ###############################
preds_test = []
preds_val = []
for arg_str in arg_strs_anchor:
    args = TopoRegArgs(f'{arg_str}')
    # set random seed
    set_seed(args.seed)
    mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)
    # Stack the predicted responses for test and validation sets
    preds_test.append(pred_test)
    preds_val.append(pred_val)

########################### Combine the results ###############################
# ensemble
pred_test = np.array(preds_test).mean(axis=0)
# evaluation
print('Performance of ensemble predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)


# stacking the results
pred_test, train_time, test_time = stack_models(preds_val, preds_test, target, val_idx)
# evaluation
print('Performance of stacking predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Performance of ensemble predictions:
Spearman: 0.8117436057951581
R2: 0.8587051812567446
RMSE: 0.6970878467718723
NRMSE: 0.37589203069931587
Performance of stacking predictions:
Spearman: 0.8407993053314502
R2: 0.8689782680175232
RMSE: 0.6712681800377205
NRMSE: 0.36196924176299394


### Different distances

In [10]:
from extendtr.utils.utils import stack_models
# load the descriptor - the indices of desc will be used later
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_ECFP4.parquet', engine='fastparquet').astype('bool')
# define arg strs for different anchor selection methods
arg_strs_dist = [f'-ensemble 1', 
                f'-distance tversky -ensemble 1',
                f'-distance euclidean -ensemble 1', 
                f'-distance cosine -ensemble 1'
                ]


########################### Train and get the prediction ###############################
preds_test = []
preds_val = []
for arg_str in arg_strs_dist:
    args = TopoRegArgs(f'{arg_str}')
    # set random seed
    set_seed(args.seed)
    mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)
    # Stack the predicted responses for test and validation sets
    preds_test.append(pred_test)
    preds_val.append(pred_val)

########################### Combine the results ###############################
# ensemble
pred_test = np.array(preds_test).mean(axis=0)
# evaluation
print('Performance of ensemble predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)


# stacking the results
pred_test, train_time, test_time = stack_models(preds_val, preds_test, target, val_idx)
# evaluation
print('Performance of stacking predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Performance of ensemble predictions:
Spearman: 0.8250608014159587
R2: 0.8581550874070674
RMSE: 0.6984434917870883
NRMSE: 0.3766230377883603
Performance of stacking predictions:
Spearman: 0.8627928556748934
R2: 0.9018851154434258
RMSE: 0.5808872463801671
NRMSE: 0.31323295573195087


### Different models

In [11]:
from extendtr.utils.utils import stack_models
# load the descriptor - the indices of desc will be used later
desc = pd.read_parquet(f'./SampleDatasets/CHEMBL278/data_ECFP4.parquet', engine='fastparquet').astype('bool')
# define arg strs for different anchor selection methods
arg_strs_mdls = [f'-ensemble 1', 
                f'-model LR_L1 -ensemble 1',
                f'-model RF -ensemble 1', 
                f'-model ANN -ensemble 1'
                ]


########################### Train and get the prediction ###############################
preds_test = []
preds_val = []
for arg_str in arg_strs_dist:
    args = TopoRegArgs(f'{arg_str}')
    # set random seed
    set_seed(args.seed)
    mdl, pred_test, pred_val, train_time, test_time = TopoReg(desc, target, train_idx, test_idx, val_idx, args)
    # Stack the predicted responses for test and validation sets
    preds_test.append(pred_test)
    preds_val.append(pred_val)

########################### Combine the results ###############################
# ensemble
pred_test = np.array(preds_test).mean(axis=0)
# evaluation
print('Performance of ensemble predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)


# stacking the results
pred_test, train_time, test_time = stack_models(preds_val, preds_test, target, val_idx)
# evaluation
print('Performance of stacking predictions:')
scorr, r2, rmse, nrmse = metric_calc(pred_test, target.loc[test_idx], True)

Performance of ensemble predictions:
Spearman: 0.8250608014159587
R2: 0.8581550874070674
RMSE: 0.6984434917870883
NRMSE: 0.3766230377883603
Performance of stacking predictions:
Spearman: 0.8627928556748934
R2: 0.9018851154434258
RMSE: 0.5808872463801671
NRMSE: 0.31323295573195087
