In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from adaptor.utils.args import TRArgs
from adaptor.utils.utils import set_seed
from adaptor.TR.distance import calculate_distance_parallel
from adaptor.TR.topoReg import TopoReg, TopoReg_desc

## Run AdapToR with distances 

In [None]:
# Folder holding the example dataset
path = '../example_datasets/CHEMBL278/'

# Descriptor table (data_mhfp6.parquet), read with the fastparquet engine
desc = pd.read_parquet(f'{path}/data_mhfp6.parquet', engine='fastparquet')

# Response values: the "pChEMBL Value" column, kept as a one-column DataFrame
data = pd.read_csv(f'{path}/data_cp.csv', index_col=0)
target = data["pChEMBL Value"].to_frame()

# Structure distances: every row of `desc` against every row of `desc`
# ('mhfp_jaccard'), labelled on both axes with desc.index
distance_x = pd.DataFrame(
    calculate_distance_parallel(
        desc, desc.index, desc.index, distance='mhfp_jaccard', cpus=1
    ),
    index=desc.index,
    columns=desc.index,
)

# Response distances: pairwise Euclidean distances between the target values,
# labelled on both axes with target.index
distance_y = pd.DataFrame(
    pairwise_distances(target.values.reshape(-1, 1), metric="euclidean", n_jobs=-1),
    index=target.index,
    columns=target.index,
)

# Train/test split of the selected fold, read from the 'Compound_ID' column
fold = 0
train_file = f"{path}/train_fold_{fold}.csv"
test_file = f"{path}/test_fold_{fold}.csv"
train_idx = pd.read_csv(train_file)['Compound_ID'].tolist()
test_idx = pd.read_csv(test_file)['Compound_ID'].tolist()

# Keep only the IDs that are present in target.index
train_idx = [cid for cid in train_idx if cid in target.index]
test_idx = [cid for cid in test_idx if cid in target.index]

# Build the run configuration from a command-line style option string.
# Add `-save_mdls 1` to the options to get the models.
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster 
    -num_anchors_x 0.15 
    -anchor_x_sel adaptive 
    -num_steps 4 
    -model LR_L2 
    -recon optimize
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train the AdapToR model from the precomputed distances and collect
# the metrics, predictions, selected anchors and models
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)


Final test performance:
Spearman: 0.9317471566657725
R2: 0.8692531426774518
RMSE: 0.5075985496236505
NRMSE: 0.3615893490170143
PCC: 0.9397836203757282
Bias: 0.12001183270567459


### Alternatively, we can use the function `TopoReg_desc`, which takes descriptors and targets as inputs instead of distances.

In [None]:
# Train the AdapToR model directly from the descriptor table and the targets
# (no precomputed distance matrices are passed in) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg_desc(desc, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9305373572846873
R2: 0.8606351470937781
RMSE: 0.5240604355688506
NRMSE: 0.37331602283617826
PCC: 0.9358399595683664
Bias: 0.12423635992122747


### Example code for saving and loading results

In [None]:
import os
from adaptor.utils.utils import save_results, load_results

# --- Save the results ---
# Suffix identifying this dataset/fold in the output file names
output_suffix = f'CHEMBL278_fold_{fold}'
# Plain string literal: nothing is interpolated here, so no f-prefix is needed.
# The trailing slash is kept because save_results/load_results receive this
# value as-is.
results_folder = '../results/'
os.makedirs(results_folder, exist_ok=True)
# Bundle the index lists so they are stored together with the results
indices = {
    "train_idx": train_idx,
    "test_idx": test_idx,
    "anchors_idx_x_all": anchors_idx_x_all,
    "anchors_idx_y": anchors_idx_y
}
save_results(results_folder, output_suffix, metrics, preds_test, preds_val, preds_val_stack, indices, models, args)
# Also write the metrics to a CSV file. os.path.join does not add a second
# separator after the trailing '/' of results_folder, so the path is
# '../results/test_metrics_results_<suffix>.csv' rather than '../results//...'.
output_file = os.path.join(results_folder, f'test_metrics_results_{output_suffix}.csv')
pd.DataFrame(metrics).to_csv(output_file, index=False)

# --- Load the results ---
# Note: this rebinds metrics, the prediction variables, indices and models
# with the values read back from disk.
metrics, preds_test, preds_val, preds_val_stack, indices, models = load_results(results_folder, output_suffix)
# Unpack the index lists again
train_idx = indices["train_idx"]
test_idx = indices["test_idx"]
anchors_idx_x_all = indices["anchors_idx_x_all"]
anchors_idx_y = indices["anchors_idx_y"]

## Ablation analysis of AdapToR

### Prepare data

In [9]:
# Folder holding the example dataset
path = '../example_datasets/CHEMBL278/'

# Descriptor tables: data_mhfp6.parquet as stored, data_ECFP4.parquet cast to bool
desc = pd.read_parquet(f'{path}/data_mhfp6.parquet', engine='fastparquet')
desc_ecfp4 = pd.read_parquet(f'{path}/data_ECFP4.parquet', engine='fastparquet').astype('bool')

# Response values: the "pChEMBL Value" column, kept as a one-column DataFrame
data = pd.read_csv(f'{path}/data_cp.csv', index_col=0)
target = data["pChEMBL Value"].to_frame()

# Structure distances from the MHFP descriptors ('mhfp_jaccard'),
# labelled on both axes with desc.index
distance_x = pd.DataFrame(
    calculate_distance_parallel(
        desc, desc.index, desc.index, distance='mhfp_jaccard', cpus=1
    ),
    index=desc.index,
    columns=desc.index,
)

# Structure distances from the ECFP4 descriptors ('jaccard'),
# labelled on both axes with desc_ecfp4.index
distance_x_ecfp4 = pd.DataFrame(
    calculate_distance_parallel(
        desc_ecfp4, desc_ecfp4.index, desc_ecfp4.index, distance='jaccard', cpus=1
    ),
    index=desc_ecfp4.index,
    columns=desc_ecfp4.index,
)

# Response distances: pairwise Euclidean distances between the target values,
# labelled on both axes with target.index
distance_y = pd.DataFrame(
    pairwise_distances(target.values.reshape(-1, 1), metric="euclidean", n_jobs=-1),
    index=target.index,
    columns=target.index,
)

# Train/test split of the selected fold, read from the 'Compound_ID' column
fold = 0
train_file = f"{path}/train_fold_{fold}.csv"
test_file = f"{path}/test_fold_{fold}.csv"
train_idx = pd.read_csv(train_file)['Compound_ID'].tolist()
test_idx = pd.read_csv(test_file)['Compound_ID'].tolist()

# Keep only the IDs that are present in target.index
train_idx = [cid for cid in train_idx if cid in target.index]
test_idx = [cid for cid in test_idx if cid in target.index]

### TR

In [None]:
# Configuration for the "TR" run: `same` y-anchors, `random` x-anchor
# selection, a single step, the `LR` model and `rbf` reconstruction
args = TRArgs("""
    -anchors_y_sel same 
    -num_anchors_x 0.6 
    -anchor_x_sel random 
    -num_steps 1 
    -model LR 
    -recon rbf
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the ECFP4 structure distances and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x_ecfp4, distance_y, target, train_idx, test_idx, args)


Final test performance:
Spearman: 0.9136001659494947
R2: 0.8440375480656079
RMSE: 0.5543892114660223
NRMSE: 0.3949208172968247
PCC: 0.9265127809519669
Bias: 0.1631237998341933


### TR (L2)

In [11]:
# Configuration for the "TR (L2)" run: the "TR" options with
# `-model LR_L2` in place of `-model LR`
args = TRArgs("""
    -anchors_y_sel same 
    -num_anchors_x 0.6 
    -anchor_x_sel random 
    -num_steps 1 
    -model LR_L2 
    -recon rbf
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the ECFP4 structure distances and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x_ecfp4, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9065430028931645
R2: 0.8518942790444715
RMSE: 0.5402448808597815
NRMSE: 0.38484506097328114
PCC: 0.9295958442514846
Bias: 0.1871990209363205


### TR (L2, MHFP)

In [12]:
# Configuration for the "TR (L2, MHFP)" run: same options as "TR (L2)";
# only the structure distance passed to TopoReg changes
args = TRArgs("""
    -anchors_y_sel same 
    -num_anchors_x 0.6 
    -anchor_x_sel random 
    -num_steps 1 
    -model LR_L2 
    -recon rbf
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.916019764711665
R2: 0.8374217961801574
RMSE: 0.5660253783168819
NRMSE: 0.403209875647711
PCC: 0.9255661362517185
Bias: 0.17705609201683814


### TR (Improved)

In [13]:
# Configuration for the "TR (Improved)" run: the "TR (L2, MHFP)" options with
# `-num_anchors_y 10` added and `-anchors_y_sel cluster` in place of `same`
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster 
    -num_anchors_x 0.6 
    -anchor_x_sel random 
    -num_steps 1 
    -model LR_L2 
    -recon rbf
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9148099653305798
R2: 0.8302463392005774
RMSE: 0.5783813990977283
NRMSE: 0.41201172410432996
PCC: 0.9228551294932994
Bias: 0.18894127878056055


### AdapToR (RBF)

In [14]:
# Configuration for the "AdapToR (RBF)" run: `adaptive` x-anchor selection
# with `-num_anchors_x 0.15` over 4 steps, still using `rbf` reconstruction
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster 
    -num_anchors_x 0.15 
    -anchor_x_sel adaptive 
    -num_steps 4 
    -model LR_L2 
    -recon rbf
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9269079591414315
R2: 0.8487832660564587
RMSE: 0.5458894108500031
NRMSE: 0.38886595883869957
PCC: 0.9295916579425507
Bias: 0.1844702913070261


### AdapToR

In [15]:
# Configuration for the "AdapToR" run: the "AdapToR (RBF)" options with
# `-recon optimize` in place of `-recon rbf`
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster 
    -num_anchors_x 0.15 
    -anchor_x_sel adaptive 
    -num_steps 4 
    -model LR_L2 
    -recon optimize
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9317471566657725
R2: 0.8692531426774518
RMSE: 0.5075985496236505
NRMSE: 0.3615893490170143
PCC: 0.9397836203757282
Bias: 0.12001183270567459


## Incorporating the ensemble and stacking approaches

### Prepare data

In [3]:
# NOTE(review): this repeats the data preparation of the ablation section
# step for step — presumably so this section can be run on its own.

# Folder holding the example dataset
path = '../example_datasets/CHEMBL278/'

# Descriptor tables: data_mhfp6.parquet as stored, data_ECFP4.parquet cast to bool
desc = pd.read_parquet(f'{path}/data_mhfp6.parquet', engine='fastparquet')
desc_ecfp4 = pd.read_parquet(f'{path}/data_ECFP4.parquet', engine='fastparquet').astype('bool')

# Response values: the "pChEMBL Value" column, kept as a one-column DataFrame
data = pd.read_csv(f'{path}/data_cp.csv', index_col=0)
target = data["pChEMBL Value"].to_frame()

# Structure distances from the MHFP descriptors ('mhfp_jaccard'),
# labelled on both axes with desc.index
distance_x = pd.DataFrame(
    calculate_distance_parallel(
        desc, desc.index, desc.index, distance='mhfp_jaccard', cpus=1
    ),
    index=desc.index,
    columns=desc.index,
)

# Structure distances from the ECFP4 descriptors ('jaccard'),
# labelled on both axes with desc_ecfp4.index
distance_x_ecfp4 = pd.DataFrame(
    calculate_distance_parallel(
        desc_ecfp4, desc_ecfp4.index, desc_ecfp4.index, distance='jaccard', cpus=1
    ),
    index=desc_ecfp4.index,
    columns=desc_ecfp4.index,
)

# Response distances: pairwise Euclidean distances between the target values,
# labelled on both axes with target.index
distance_y = pd.DataFrame(
    pairwise_distances(target.values.reshape(-1, 1), metric="euclidean", n_jobs=-1),
    index=target.index,
    columns=target.index,
)

# Train/test split of the selected fold, read from the 'Compound_ID' column
fold = 0
train_file = f"{path}/train_fold_{fold}.csv"
test_file = f"{path}/test_fold_{fold}.csv"
train_idx = pd.read_csv(train_file)['Compound_ID'].tolist()
test_idx = pd.read_csv(test_file)['Compound_ID'].tolist()

# Keep only the IDs that are present in target.index
train_idx = [cid for cid in train_idx if cid in target.index]
test_idx = [cid for cid in test_idx if cid in target.index]

### Ensemble TR (original)

In [20]:
# Configuration for the "Ensemble TR (original)" run: `random` x-anchor
# selection over 15 steps with the `LR` model and `rbf` reconstruction,
# integrated with `-integrate_method ensemble`
args = TRArgs("""
    -anchors_y_sel same
    -anchor_x_sel random
    -random_anchor_perc 1
    -num_steps 15 
    -model LR 
    -recon rbf
    -integrate_method ensemble
    -append_anchors 0
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the ECFP4 structure distances and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x_ecfp4, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9277144920621552
R2: 0.8650376655356253
RMSE: 0.5157165087431721
NRMSE: 0.3673722015400385
PCC: 0.9399210032683176
Bias: 0.20404032734334132


### Ensemble TR (enhanced)

In [21]:
# Configuration for the "Ensemble TR (enhanced)" run: `cluster` y-anchors
# (`-num_anchors_y 10`), the `LR_L2` model and `optimize` reconstruction,
# integrated with `-integrate_method ensemble`
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster
    -anchor_x_sel random
    -random_anchor_perc 1
    -num_steps 15 
    -model LR_L2 
    -recon optimize
    -integrate_method ensemble
    -append_anchors 0
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9148099653305798
R2: 0.8469449692401291
RMSE: 0.5491974948992563
NRMSE: 0.39122248243150715
PCC: 0.9292843002451947
Bias: 0.14640650354600748


### Stack TR (enhanced)

In [4]:
# Configuration for the "Stack TR (enhanced)" run: the "Ensemble TR (enhanced)"
# options with `-integrate_method stack` in place of `ensemble`.
# NOTE(review): the output recorded for this cell reports R2 of about -47.6 and
# RMSE of about 9.79, far worse than the ensemble run whose options are
# otherwise identical — confirm whether this is expected for stacking here.
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster
    -anchor_x_sel random
    -random_anchor_perc 1
    -num_steps 15 
    -model LR_L2 
    -recon optimize
    -integrate_method stack
    -append_anchors 0
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.16090331768432944
R2: -47.5994561341216
RMSE: 9.786343326397201
NRMSE: 6.971331015962562
PCC: 0.16360253273241626
Bias: -0.012402375314020324


### AdapToR (ensemble)

In [5]:
# Configuration for the "AdapToR (ensemble)" run: `adaptive` x-anchor
# selection with `-num_anchors_x 0.15` over 4 steps, integrated with
# `-integrate_method ensemble`
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster
    -num_anchors_x 0.15          
    -anchor_x_sel adaptive
    -num_steps 4
    -model LR_L2 
    -recon optimize
    -integrate_method ensemble
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.9109789339571435
R2: 0.8525450778932087
RMSE: 0.5390566153733726
NRMSE: 0.383998596490653
PCC: 0.9330456864807818
Bias: 0.1367946336018341


### AdapToR (stack)

In [6]:
# Configuration for the "AdapToR (stack)" run: the "AdapToR (ensemble)"
# options with `-integrate_method stack` in place of `ensemble`.
# NOTE(review): the output recorded for this cell reports R2 of about 0.43,
# against about 0.85 for the ensemble run — confirm whether this gap is expected.
args = TRArgs("""
    -num_anchors_y 10 
    -anchors_y_sel cluster
    -num_anchors_x 0.15          
    -anchor_x_sel adaptive
    -num_steps 4
    -model LR_L2 
    -recon optimize
    -integrate_method stack
""")

# Fix the random seed using the value carried by the parsed arguments
set_seed(args.seed)

# Train on the MHFP structure distances (distance_x) and collect the results
(
    metrics, preds_test, preds_val, preds_val_stack,
    pred_test, anchors_idx_x_all, anchors_idx_y, models,
) = TopoReg(distance_x, distance_y, target, train_idx, test_idx, args)

Final test performance:
Spearman: 0.6958362773541616
R2: 0.4281309345771601
RMSE: 1.0615807760230336
NRMSE: 0.7562202492811468
PCC: 0.6920227678485582
Bias: 0.4220483115345252
