## Preparation

#### Import Libraries

In [1]:
import torch
import numpy as np
import pandas as pd
from model_GNN import solvgnn_binary, solvgnn_ternary
from generate_dataset import solvent_dataset_binary, solvent_dataset_ternary
import matplotlib.pyplot as plt

Using backend: pytorch


#### Load trained models

In [2]:
# Build the binary-mixture GNN (74 input features, 256 hidden units, 2 outputs).
# Place it on the GPU when one is available and fall back to the CPU otherwise,
# so this cell no longer raises on machines without CUDA.
# NOTE(review): downstream helpers may still move their inputs to CUDA
# explicitly -- confirm before relying on the CPU path.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = solvgnn_binary(in_dim=74, hidden_dim=256, n_classes=2).to(device)

In [3]:
# Checkpoint paths for the five cross-validation folds (cv0 .. cv4).
checkpoint_template = '../saved_model/solvgnn_binary_final_model_cv{}.pth'
cv_param_list = [checkpoint_template.format(fold) for fold in range(5)]

#### Initialize dataset class

In [4]:
# Input files: the binary-mixture table and the solvent lookup list.
solvent_list_path = '../data/solvent_list.csv'
dataset_path = '../data/output_binary_all.csv'

# Build the dataset wrapper with generate_all disabled.
dataset = solvent_dataset_binary(
    input_file_path=dataset_path,
    solvent_list_path=solvent_list_path,
    generate_all=False,
)

In [5]:
# Look up the solvent IDs of the two base chemicals; the return value is
# discarded and only the printed matches are of interest.
for chemical_name in ("benzene", "toluene"):
    _ = dataset.search_chemical(chemical_name)

solvent_262 BENZENE c1ccccc1
solvent_782 TOLUENE Cc1ccccc1


In [6]:
# Base mixture: benzene (solvent_262) as component 1 and toluene (solvent_782)
# as component 2, per the search output above, with solv1 fraction 0.5.
base_solv1_id, base_solv2_id = "solvent_262", "solvent_782"
base_solv1_x = 0.5

In [7]:
# Disabled calls to dataset.get_all_predictions for each of the two base solvents.
# NOTE(review): presumably these generate the
# ../counterfactual/output_cv_base_<solvent id>.csv files that later cells read
# -- confirm against generate_dataset before re-enabling.
# NOTE(review): `cp_list` is not defined anywhere in the visible notebook; the
# checkpoint path list built above is named `cv_param_list`.
# _ = dataset.get_all_predictions(base_solv1_id,model,cp_list)
# _ = dataset.get_all_predictions(base_solv2_id,model,cp_list)

In [8]:
# Load the prediction table stored for the current base solvent 1.
counterfactual_csv = "../counterfactual/output_cv_base_{}.csv".format(base_solv1_id)
output_cv = pd.read_csv(counterfactual_csv)

# Row masks: does a mixture contain base solvent 1 / base solvent 2 in either slot?
solvent_id_columns = output_cv[["solv1", "solv2"]]
filter_condition1 = solvent_id_columns.eq(base_solv1_id).any(axis=1)
filter_condition2 = solvent_id_columns.eq(base_solv2_id).any(axis=1)
filter_condition_and = filter_condition1 & filter_condition2
filter_condition_or = filter_condition1 | filter_condition2

# The base sample is the base pair at the chosen solv1 fraction; its predicted
# values are the reference for the later absolute-error term.
at_base_fraction = output_cv["solv1_x"].eq(base_solv1_x)
base_sample = output_cv.loc[filter_condition_and & at_base_fraction]
pred_gam1 = base_sample["pred_gam1"].iloc[0]
pred_gam2 = base_sample["pred_gam2"].iloc[0]

In [9]:
# Similarity between the base pair and every mixture in output_cv.
# The loop index keeps the name `i` on purpose: it stays bound at notebook
# scope after the loop, exactly as in the original cell.
n_rows = len(output_cv)
candidate_smiles = zip(output_cv["solv1_smiles"], output_cv["solv2_smiles"])
sim_all = []
for i, (candidate_smiles1, candidate_smiles2) in enumerate(candidate_smiles):
    smiles_list1 = [dataset.solvent_smiles[base_solv1_id],
                    dataset.solvent_smiles[base_solv2_id]]
    smiles_list2 = [candidate_smiles1, candidate_smiles2]
    # Only the second returned value (the similarity) is kept here.
    _, sim = dataset.get_external_similarity(
        idx1=None,
        idx2=None,
        smiles_list1=smiles_list1,
        smiles_list2=smiles_list2,
    )
    sim_all.append(sim)
    n_done = i + 1
    if n_done % 500 == 0:
        print('{} out of {} done!'.format(n_done, n_rows))
sim_all = np.array(sim_all)

500 out of 3500 done!
1000 out of 3500 done!
1500 out of 3500 done!
2000 out of 3500 done!
2500 out of 3500 done!
3000 out of 3500 done!
3500 out of 3500 done!


In [10]:
# Mean absolute deviation of each row's two predictions from the base sample,
# rescaled so that the largest deviation equals 1.
gam1_gap = (output_cv["pred_gam1"] - pred_gam1).abs()
gam2_gap = (output_cv["pred_gam2"] - pred_gam2).abs()
ae = ((gam1_gap + gam2_gap) / 2).to_numpy()
ae = ae / ae.max()

# Score = weighted sum of similarity (weight alpha) and normalised deviation.
alpha = 0.9
cv_score = alpha * sim_all + (1 - alpha) * ae

sim_column = "sim_base2_{}".format(base_solv2_id)
score_column = "cf_score_base2_{}".format(base_solv2_id)
output_cv[sim_column] = sim_all
output_cv[score_column] = cv_score

# Candidate pools: the base pair itself, and mixtures that keep base solvent 1
# but do not contain base solvent 2.
base_pair_rows = output_cv[filter_condition_and]
partner_swapped_rows = output_cv[filter_condition1 & ~filter_condition2]

# Highest-scoring row of each pool, plus the lowest-scoring partner-swapped row.
cf_idx1 = base_pair_rows.sort_values(by=score_column, ascending=False).iloc[0]
cf_idx2 = partner_swapped_rows.sort_values(by=score_column, ascending=False).iloc[0]
cf_idx4 = partner_swapped_rows.sort_values(by=score_column, ascending=True).iloc[0]

In [11]:
# Swap the roles: toluene (solvent_782) is now base solvent 1 and benzene
# (solvent_262) is base solvent 2; the solv1 fraction stays at 0.5.
base_solv1_id, base_solv2_id = "solvent_782", "solvent_262"
base_solv1_x = 0.5

In [12]:
# Load the prediction table stored for the (now swapped) base solvent 1.
counterfactual_csv = "../counterfactual/output_cv_base_{}.csv".format(base_solv1_id)
output_cv = pd.read_csv(counterfactual_csv)

# Row masks: does a mixture contain base solvent 1 / base solvent 2 in either slot?
solvent_id_columns = output_cv[["solv1", "solv2"]]
filter_condition1 = solvent_id_columns.eq(base_solv1_id).any(axis=1)
filter_condition2 = solvent_id_columns.eq(base_solv2_id).any(axis=1)
filter_condition_and = filter_condition1 & filter_condition2
filter_condition_or = filter_condition1 | filter_condition2

# Reference predictions: the base pair at the chosen solv1 fraction.
at_base_fraction = output_cv["solv1_x"].eq(base_solv1_x)
base_sample = output_cv.loc[filter_condition_and & at_base_fraction]
pred_gam1 = base_sample["pred_gam1"].iloc[0]
pred_gam2 = base_sample["pred_gam2"].iloc[0]

In [13]:
# Similarity between the (swapped) base pair and every mixture in output_cv.
# The loop index keeps the name `i` on purpose: it stays bound at notebook
# scope after the loop, exactly as in the original cell.
n_rows = len(output_cv)
candidate_smiles = zip(output_cv["solv1_smiles"], output_cv["solv2_smiles"])
sim_all = []
for i, (candidate_smiles1, candidate_smiles2) in enumerate(candidate_smiles):
    smiles_list1 = [dataset.solvent_smiles[base_solv1_id],
                    dataset.solvent_smiles[base_solv2_id]]
    smiles_list2 = [candidate_smiles1, candidate_smiles2]
    # Only the second returned value (the similarity) is kept here.
    _, sim = dataset.get_external_similarity(
        idx1=None,
        idx2=None,
        smiles_list1=smiles_list1,
        smiles_list2=smiles_list2,
    )
    sim_all.append(sim)
    n_done = i + 1
    if n_done % 500 == 0:
        print('{} out of {} done!'.format(n_done, n_rows))
sim_all = np.array(sim_all)

500 out of 3500 done!
1000 out of 3500 done!
1500 out of 3500 done!
2000 out of 3500 done!
2500 out of 3500 done!
3000 out of 3500 done!
3500 out of 3500 done!


In [14]:
# Mean absolute deviation of each row's two predictions from the base sample,
# rescaled so that the largest deviation equals 1.
gam1_gap = (output_cv["pred_gam1"] - pred_gam1).abs()
gam2_gap = (output_cv["pred_gam2"] - pred_gam2).abs()
ae = ((gam1_gap + gam2_gap) / 2).to_numpy()
ae = ae / ae.max()

# Score = weighted sum of similarity (weight alpha) and normalised deviation.
alpha = 0.9
cv_score = alpha * sim_all + (1 - alpha) * ae

sim_column = "sim_base2_{}".format(base_solv2_id)
score_column = "cf_score_base2_{}".format(base_solv2_id)
output_cv[sim_column] = sim_all
output_cv[score_column] = cv_score

# Mixtures that keep base solvent 1 but do not contain base solvent 2:
# take the highest- and the lowest-scoring row.
partner_swapped_rows = output_cv[filter_condition1 & ~filter_condition2]
cf_idx3 = partner_swapped_rows.sort_values(by=score_column, ascending=False).iloc[0]
cf_idx5 = partner_swapped_rows.sort_values(by=score_column, ascending=True).iloc[0]

In [15]:
# Restore the original roles: benzene (solvent_262) as base solvent 1 and
# toluene (solvent_782) as base solvent 2, at solv1 fraction 0.5.
base_solv1_id, base_solv2_id = "solvent_262", "solvent_782"
base_solv1_x = 0.5

In [16]:
# Step 1: recover the reference predictions of the base sample from the
# per-solvent counterfactual table.
base_table = pd.read_csv("../counterfactual/output_cv_base_{}.csv".format(base_solv1_id))
base_id_columns = base_table[["solv1", "solv2"]]
is_base_pair = (base_id_columns.eq(base_solv1_id).any(axis=1)
                & base_id_columns.eq(base_solv2_id).any(axis=1))
base_sample = base_table[is_base_pair & (base_table["solv1_x"] == base_solv1_x)]
pred_gam1 = base_sample["pred_gam1"].values[0]
pred_gam2 = base_sample["pred_gam2"].values[0]

# Step 2: switch output_cv to the full cross-validation output, ordered by
# "idx" and re-indexed from 0, then attach the solvent descriptors taken from
# dataset.dataset (the assignment aligns on the row index).
output_cv = pd.read_csv('../analysis/solvgnn_output_cv_binary.csv')
output_cv = output_cv.sort_values(by="idx").reset_index()
for descriptor in ("solv1_smiles", "solv2_smiles",
                   "solv1_name", "solv2_name",
                   "solv1", "solv2"):
    output_cv[descriptor] = dataset.dataset[descriptor]

# Step 3: rebuild the row masks against the full table.
solvent_id_columns = output_cv[["solv1", "solv2"]]
filter_condition1 = solvent_id_columns.eq(base_solv1_id).any(axis=1)
filter_condition2 = solvent_id_columns.eq(base_solv2_id).any(axis=1)
filter_condition_and = filter_condition1 & filter_condition2
filter_condition_or = filter_condition1 | filter_condition2

In [17]:
# Score every row of the full cross-validation table against the base pair
# (base_solv1_id, base_solv2_id) and select the lowest-scoring mixture that
# contains neither base solvent.
sim_perm_all = []
sim_all = []
for sample_id in range(len(dataset)//5):
    smiles_list1 = [dataset.solvent_smiles[base_solv1_id],dataset.solvent_smiles[base_solv2_id]]
    # Bug fix: index with this loop's own variable. The original used `i`, a
    # stale global left by an earlier cell's loop, so every iteration compared
    # the base pair against one and the same row (and the cell raised NameError
    # on a fresh kernel when that earlier loop had not run).
    smiles_list2 = [output_cv["solv1_smiles"].iloc[sample_id],output_cv["solv2_smiles"].iloc[sample_id]]
    sim_perm,sim = dataset.get_external_similarity(idx1=None,idx2=None,
                                                   smiles_list1=smiles_list1,smiles_list2=smiles_list2)
    sim_perm_all.append(sim_perm)
    sim_all.append(sim)
    if (sample_id + 1) % 2000 == 0:
        print('{} out of {} done!'.format(sample_id+1,len(dataset)//5))
# Only the first len(dataset)//5 rows are evaluated; the lists are then repeated
# five times to span the whole table.
# NOTE(review): this assumes the table consists of five consecutive blocks that
# list the same solvent pairs in the same order -- confirm against the dataset.
sim_all = np.array(sim_all*5)
sim_perm_all = np.array(sim_perm_all*5)
# Mean absolute deviation from the base predictions, in direct and in swapped
# component order.
ae = (np.abs(output_cv["pred_gam1"]-pred_gam1)+np.abs(output_cv["pred_gam2"]-pred_gam2))/2
ae_perm2 = (np.abs(output_cv["pred_gam2"]-pred_gam1)+np.abs(output_cv["pred_gam1"]-pred_gam2))/2
# Rows whose permutation flag equals 1 use the swapped-order deviation.
# NOTE(review): presumably the flag marks pairs matched in swapped order -- confirm.
ae[sim_perm_all==1] = ae_perm2[sim_perm_all==1]
ae = ae/np.max(ae)
alpha = 0.9
cv_score = alpha*sim_all+(1-alpha)*ae
output_cv["sim_base_{}_{}".format(base_solv1_id,base_solv2_id)] = sim_all
output_cv["cf_score_base_{}_{}".format(base_solv1_id,base_solv2_id)] = cv_score
# Lowest-scoring row among mixtures that contain neither base solvent.
cf_idx6 = output_cv[~filter_condition_or].sort_values(by="cf_score_base_{}_{}".format(base_solv1_id,base_solv2_id),
                                                      ascending=True).iloc[0]

2000 out of 40000 done!
4000 out of 40000 done!
6000 out of 40000 done!
8000 out of 40000 done!
10000 out of 40000 done!
12000 out of 40000 done!
14000 out of 40000 done!
16000 out of 40000 done!
18000 out of 40000 done!
20000 out of 40000 done!
22000 out of 40000 done!
24000 out of 40000 done!
26000 out of 40000 done!
28000 out of 40000 done!
30000 out of 40000 done!
32000 out of 40000 done!
34000 out of 40000 done!
36000 out of 40000 done!
38000 out of 40000 done!
40000 out of 40000 done!


In [18]:
# Print the six selected rows in order (cf_idx1 .. cf_idx6).
for selected_row in (cf_idx1, cf_idx2, cf_idx3, cf_idx4, cf_idx5, cf_idx6):
    print(selected_row)

solv1_smiles                     c1ccccc1
solv2_smiles                    Cc1ccccc1
solv1_name                        BENZENE
solv2_name                        TOLUENE
solv1                         solvent_262
solv2                         solvent_782
solv1_x                               0.1
pred_gam1                       -0.022158
pred_gam2                       -0.005008
sim_base2_solvent_782                 1.0
cf_score_base2_solvent_782       0.900463
Name: 3250, dtype: object
solv1_smiles                     c1ccccc1
solv2_smiles                    Oc1ccccc1
solv1_name                        BENZENE
solv2_name                         PHENOL
solv1                         solvent_262
solv2                         solvent_710
solv1_x                               0.9
pred_gam1                        0.019816
pred_gam2                         0.90866
sim_base2_solvent_782            0.684211
cf_score_base2_solvent_782       0.633519
Name: 2939, dtype: object
solv1_smiles            